from transformers import BertTokenizer, BertModel
import torch

# Load tokenizer and model (with attention outputs enabled)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
model.eval()

# Encode input text
inputs = tokenizer("Example sentence for BERT attention visualization.", return_tensors="pt")

# Forward pass, get attentions
with torch.no_grad():
    outputs = model(**inputs)
attentions = outputs.attentions  # Tuple with one tensor per layer, each of shape
                                 # (batch_size, num_heads, seq_len, seq_len)

# Get the vocabulary id for a word of interest, e.g., "attention"
token_id = tokenizer.convert_tokens_to_ids("attention")

# Find the positions of this token in the input sequence
token_positions = (inputs['input_ids'][0] == token_id).nonzero(as_tuple=True)[0]

# Access the attention from the first of these positions, e.g., first layer, first head
attention_layer_head = attentions[0][0, 0, token_positions[0], :]

# `attention_layer_head` now holds the attention weights from the word "attention"
# to all other tokens in this specific layer and head
print(attention_layer_head)
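
# --- Optional visualization (a minimal sketch, not part of the original snippet) ---
# Assuming the variables above (`tokenizer`, `inputs`, `attention_layer_head`) are still
# in scope and that matplotlib is installed, one simple way to inspect the weights is a
# bar chart of the attention from the word "attention" to every token in the sequence.
import matplotlib.pyplot as plt

tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
weights = attention_layer_head.tolist()

plt.figure(figsize=(10, 3))
plt.bar(range(len(tokens)), weights)
plt.xticks(range(len(tokens)), tokens, rotation=45, ha='right')
plt.ylabel('Attention weight')
plt.title('Attention from "attention" (layer 1, head 1)')
plt.tight_layout()
plt.show()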