Transformers for Language Modeling and Sentiment Analysis

Matheus Schmitz
LinkedIn
Github Portfolio


In this problem we will learn how to implement the building blocks of "Transformer" models, implement a BERT-style language-modeling pre-training procedure for such models, and then fine-tune a pre-trained model on a sentiment analysis task using the IMDB movie review dataset. Typically, transformer models are very large and are pre-trained on language modeling tasks with massive datasets and huge computational resources. As such, we will only implement the pre-training procedure, without expecting you to pre-train a model to completion. We will then load in a pre-trained model for you to fine-tune on a sentiment analysis task.

We will complete the following steps in this problem:

  1. Implement a multi-head-attention (MHA) layer.
  2. Implement "Transformer block" layers which use MHA layers, linear layers, and residual connections.
  3. Implement a full Transformer model comprised of Transformer blocks.
  4. Implement BERT-style language model pre-training for the Transformer model.
  5. Fine-tune our trained language model on a sentiment analysis task.

In order to run on GPU in Colab go to Runtime -> Change runtime type and select GPU under the Hardware accelerator drop-down box.

1 - Scaled Dot Product Attention [8 points]

The attention mechanism describes a relatively new family of layers in neural networks that has attracted a lot of interest in the past few years, especially in sequence tasks. Here we use the following definition: the attention mechanism describes a weighted average of (sequence) elements with the weights dynamically computed based on an input query and the elements' keys. In other words, we want to dynamically decide which inputs we want to "attend" to more than others based on their values. In particular, an attention mechanism usually has 4 parts we need to specify:

  1. Query: a feature vector describing what we are looking for in the sequence.
  2. Keys: for each input element, a feature vector describing what that element "offers".
  3. Values: for each input element, the feature vector we want to average over.
  4. Score function: a function (here the scaled dot product) that takes a query and a key and returns the attention logit for that pair.

The weights of the average are calculated by a softmax over all score function outputs. Hence, value vectors whose corresponding keys are most similar to the query receive a higher weight.

The attention applied inside the Transformer architecture is called self-attention. In self-attention, each sequence element provides a key, a value, and a query. For each element, we perform an attention operation where, based on its query, we check the similarity of all sequence elements' keys and return a different, averaged value vector for each element.

The core concept behind self-attention is scaled dot product attention. Dot product attention takes as input a set of queries $Q \in \mathbb{R}^{T \times d_k}$, keys $K \in \mathbb{R}^{T \times d_k}$, and values $V \in \mathbb{R}^{T \times d_v}$, where $T$ is the sequence length, and $d_k$ and $d_v$ are the hidden dimensionalities of the queries/keys and values respectively. The attention weight from element $i$ to $j$ is based on the similarity of the query $Q_i$ and key $K_j$, using the dot product as the similarity metric. Mathematically:

$$Attention(Q,K,V)=\text{softmax}\left(\frac{Q K^T}{\sqrt{d_k}}\right) V $$

The matrix multiplication $Q K^T$ performs the dot product for every possible pair of queries and keys, resulting in a matrix of the shape $T \times T$. Each row represents the attention logits for a specific element $i$ to all other elements in the sequence. We apply a softmax and multiply with the value vector to obtain a weighted mean (the weights being determined by the attention). The computation graph is visualized below.

[Figure: computation graph of scaled dot product attention]
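The sketch below shows one way the equation above could be written in PyTorch; the function name, the optional mask argument, and the convention of returning both the output values and the attention weights are illustrative assumptions, not the assignment's required interface.

```python
import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k: (..., T, d_k); v: (..., T, d_v)
    d_k = q.size(-1)
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)  # (..., T, T)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, float('-inf'))
    attention = F.softmax(attn_logits, dim=-1)   # weights over the sequence
    values = torch.matmul(attention, v)          # weighted average of the values
    return values, attention
```

For example, calling it with random tensors of shape `(2, 5, 16)` for `q`, `k`, and `v` returns values of shape `(2, 5, 16)` and attention weights of shape `(2, 5, 5)`.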

Before you continue, run the test code listed below. It will generate random queries, keys, and value vectors, and calculate the attention outputs. Make sure you can follow the calculation of the specific values here, and also check it by hand.

2 - Build Multi-Head-Attention Layer [8 points]

Now we will implement multi-head attention, first introduced in Attention Is All You Need (Vaswani et al., 2017). Scaled dot product attention allows a network to attend over a sequence. However, there are often multiple different aspects of a sequence element one wants to attend to, and a single weighted average is not a good fit for that. This is why we extend the attention mechanism to multiple heads, i.e. multiple different query-key-value triplets computed from the same features.

A multi-head attention layer works by employing several self-attention layers in parallel. Given query, key, and value matrices, we transform them into $h$ sub-queries, sub-keys, and sub-values, which we pass through scaled dot product attention independently, where $h$ is the number of heads. Afterward, we concatenate the heads and combine them with a final weight matrix. Mathematically,

$$Multihead(Q,K,V)=Concat(head_1, ..., head_h)W^O,$$

where

$$head_i=Attention(QW^Q_i, KW^K_i, VW^V_i).$$

We refer to this as a Multi-Head Attention layer with the learnable parameters $W^Q_{1...h}\in \mathbb{R}^{d_{in}\times d_k}$, $W^K_{1...h}\in \mathbb{R}^{d_{in}\times d_k}$, $W^V_{1...h}\in \mathbb{R}^{d_{in}\times d_v}$, and $W^O\in \mathbb{R}^{h\cdot d_v \times d_{out}}$, where $d_{in}$ is the input dimensionality and $d_{out}$ is the output dimensionality. The computational graph is visualized below.

[Figure: computation graph of the multi-head attention layer]

Looking at the computation graph above, a simple but effective implementation is to use the current feature map $X\in \mathbb{R}^{B\times T\times d_{model}}$ in the network as the query, key, and value input, where $B$ is the batch size, $T$ is the sequence length, and $d_{model}$ is the hidden dimensionality of $X$. The linear projections $W^Q$, $W^K$, and $W^V$ then produce the per-head queries, keys, and values from $X$.
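A minimal sketch of such a layer is shown below, assuming $d_{in} = d_{out} = d_{model}$ and $d_k = d_v = d_{model}/h$, and reusing the scaled_dot_product_attention helper sketched in Section 1; the class and attribute names are placeholders rather than the assignment's required interface.

```python
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Project X to per-head Q, K, V, attend per head, then recombine with W^O."""
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_head = d_model // num_heads
        self.qkv_proj = nn.Linear(d_model, 3 * d_model)   # fused W^Q, W^K, W^V for all heads
        self.out_proj = nn.Linear(d_model, d_model)       # W^O

    def forward(self, x, mask=None):
        B, T, _ = x.shape
        qkv = self.qkv_proj(x)                                          # (B, T, 3*d_model)
        qkv = qkv.reshape(B, T, self.num_heads, 3 * self.d_head).permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)                                  # each (B, h, T, d_head)
        values, _ = scaled_dot_product_attention(q, k, v, mask=mask)    # (B, h, T, d_head)
        values = values.permute(0, 2, 1, 3).reshape(B, T, -1)           # concatenate heads
        return self.out_proj(values)                                    # (B, T, d_model)
```

Fusing the three projections into one linear layer is just an implementation convenience; three separate `nn.Linear` layers are equally valid.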

Let's check that your MHA layer works and returns a tensor of the correct shape.

3 - Build Transformer Blocks [8 points]

Now we construct the blocks of which Transformer models are composed.

Originally, the Transformer model was designed for machine translation. Hence, it has an encoder-decoder structure: the encoder takes as input the sentence in the source language and generates an attention-based representation, while the decoder attends over the encoded information and generates the translated sentence in an autoregressive manner, as in a standard RNN. The computational graph is visualized below. Here we will focus on the encoder part and implement the encoder block.

[Figure: the original Transformer encoder-decoder architecture]

A Transformer encoder block consists of the following modules in this order:

  1. Multi-Head Attention (we implemented above)
  2. Dropout
  3. Residual connection to the input (simply add the input of the block to the output of the previous dropout layer).
  4. Layer Norm - https://arxiv.org/abs/1607.06450
  5. Linear layer
  6. Activation function (typically gelu - https://arxiv.org/abs/1606.08415)
  7. Linear layer
  8. Dropout
  9. Residual connection to 4 (add the output of 4 to 8)
  10. Layer Norm

According to the listed modules, please implement:

class TransformerBlock(nn.Module)
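Below is a minimal sketch of one way such a block could look, assuming the MultiHeadAttention sketch from Section 2 and a hypothetical feed-forward width d_ff; your own implementation may organize the modules differently as long as it follows the order listed above.

```python
import torch.nn as nn

class TransformerBlock(nn.Module):
    """Post-LayerNorm encoder block following modules 1-10 listed above."""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)  # from Section 2
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        # 1-4: attention, dropout, residual to the block input, layer norm
        x = self.norm1(x + self.dropout1(self.attn(x, mask=mask)))
        # 5-10: linear, GELU, linear, dropout, residual to step 4's output, layer norm
        x = self.norm2(x + self.ff(x))
        return x
```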

Let's once again check that the code runs without error and outputs the correct shape (note, this is not a guarantee that you have implemented it correctly).

4 - Position Encoding [0 points]

In tasks like language understanding, position is important for interpreting the input words. Position information can therefore be added via the input features. We could learn an embedding for every possible position, but this would not generalize to varying input sequence lengths. Hence, the better option is to use feature patterns that the network can identify in the features and potentially generalize to longer sequences. Mathematically:

$$PE(pos, i) = \left\{\begin{matrix} \sin (\frac{pos}{10000^{i/d_{model}}}) & \text{if } i\mod 2=0\\ \cos (\frac{pos}{10000^{(i-1)/d_{model}}}) & \text{otherwise} \end{matrix}\right.$$

$PE(pos,i)$ represents the position encoding at position $pos$ in the sequence and hidden dimension $i$. These values, concatenated over all hidden dimensions, are added to the original input features and constitute the position information. The intuition behind this encoding is that you can represent $PE(pos+k,:)$ as a linear function of $PE(pos,:)$, which might allow the model to easily attend to relative positions.
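This layer is provided for you in the assignment; the sketch below is just one common way the formula above is written in PyTorch, assuming an even $d_{model}$.

```python
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal position encodings added to the token embeddings."""
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(max_len, dtype=torch.float).unsqueeze(1)            # (max_len, 1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)   # even dimensions
        pe[:, 1::2] = torch.cos(pos * div)   # odd dimensions
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model), not a parameter

    def forward(self, x):                    # x: (B, T, d_model)
        return x + self.pe[:, :x.size(1)]
```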

5 - Build a BERT model [8 points]

A BERT model consists of:

  1. An input embedding layer. This converts a token index into a vector embedding. Make sure to include an extra embedding for the masked tokens! In other words, learn vocab_size + 1 embeddings.
  2. Positional encodings. This layer (implemented for you already) encodes the position of each token, since multi-head-attention layers have no notion of positional locality or order. It takes as input the token embeddings from (1) and returns them with positional embeddings added.
  3. Several stacked Transformer blocks (the number specified by n_layers)
  4. Output linear layer that predicts masked words for pre-training. Takes final embedding of last block and outputs probability distribution over the vocabulary.
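The sketch below wires these four components together, assuming the PositionalEncoding and TransformerBlock sketches above; the constructor arguments are illustrative, not the assignment's required signature.

```python
import torch.nn as nn

class BERT(nn.Module):
    """Token embeddings -> positional encodings -> n_layers Transformer blocks -> LM head."""
    def __init__(self, vocab_size, d_model, num_heads, d_ff, n_layers, max_len=512, dropout=0.1):
        super().__init__()
        # 1. token embeddings, with one extra row reserved for the mask token
        self.embed = nn.Embedding(vocab_size + 1, d_model)
        # 2. positional encodings (provided / sketched above)
        self.pos_enc = PositionalEncoding(d_model, max_len=max_len)
        # 3. stacked Transformer blocks
        self.blocks = nn.ModuleList(
            [TransformerBlock(d_model, num_heads, d_ff, dropout) for _ in range(n_layers)]
        )
        # 4. output head predicting a distribution over the vocabulary
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, tokens, mask=None):       # tokens: (B, T) token indices
        x = self.pos_enc(self.embed(tokens))
        for block in self.blocks:
            x = block(x, mask=mask)
        return self.lm_head(x)                  # (B, T, vocab_size) logits
```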

Let's once again check that the code runs without error and outputs the correct shape (note, this is not a guarantee that you have implemented it correctly).

6 - Implement BERT Pre-Training [8 points]

In order to pre-train our language model, we randomly perturb mask_rate% of the tokens and attempt to predict the original tokens. Following the BERT paper, the selected tokens are perturbed as follows:

  1. 80% are replaced with the special mask token.
  2. 10% are replaced with a random token from the vocabulary.
  3. 10% are left unchanged.

The prediction task is then to predict the original token for only the perturbed tokens. You should use nn.CrossEntropyLoss. Note that this module has a keyword argument ignore_index which specifies a label index for which the loss is not computed (it is -100 by default). This can be used to compute the loss only for the perturbed tokens.

For more details, please look at Task 1 in Section 3.1 of the BERT paper. We do not consider the second pre-training task (Next Sentence Prediction) for this assignment.

We do not expect you to complete the pre-training procedure, which is not feasible given your computational resources. We are simply asking you to implement one step of training with synthetic data.
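Below is a hedged sketch of what a single pre-training step might look like, assuming a model like the BERT sketch above, the 80/10/10 corruption split from the BERT paper, and vocab_size as the index of the extra mask embedding; the exact specification expected by the grader may differ.

```python
import torch
import torch.nn as nn

def pretrain_step(model, optimizer, tokens, vocab_size, mask_rate=0.15):
    """One masked-language-modeling step on a (B, T) batch of synthetic token indices."""
    mask_token = vocab_size                                   # the extra embedding index
    labels = tokens.clone()
    selected = torch.rand_like(tokens, dtype=torch.float) < mask_rate
    labels[~selected] = -100                                  # ignored by nn.CrossEntropyLoss

    corrupted = tokens.clone()
    rand = torch.rand_like(tokens, dtype=torch.float)
    corrupted[selected & (rand < 0.8)] = mask_token           # 80% -> mask token
    random_tokens = torch.randint_like(tokens, vocab_size)
    replace = selected & (rand >= 0.8) & (rand < 0.9)         # 10% -> random token
    corrupted[replace] = random_tokens[replace]
    # remaining 10% of the selected tokens are left unchanged

    logits = model(corrupted)                                 # (B, T, vocab_size)
    loss = nn.CrossEntropyLoss(ignore_index=-100)(
        logits.reshape(-1, vocab_size), labels.reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```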

7 - Fine-Tune Pre-Trained Model on Sentiment Analysis [8 points]

In the previous section we implemented the pre-training procedure specified in the BERT paper. Now, we will take a fully-trained BERT model and use its learned representations for performing a sentiment analysis task.

We will use the transformers library to get pre-trained transformers and use them as our embedding layers. We will freeze (not train) the transformer and only train the remainder of the model which learns from the representations produced by the transformer. In this case we will be using a multi-layer bi-directional GRU, however any model can learn from these representations.

The goal of this sentiment analysis task is to predict the "sentiment" of a particular sequence. In this case the sequences are movie reviews, and we're predicting whether they are positive or negative. Our model outputs a probability of positive sentiment for each input sequence. Use nn.BCEWithLogitsLoss to fine-tune the model on this task.

Preparing Data

The transformer has already been trained with a specific vocabulary, which means we need to use that exact same vocabulary and also tokenize our data in the same way the data was tokenized when the transformer was initially trained.

Luckily, the transformers library has tokenizers for each of the transformer models provided. In this case we are using the BERT model which ignores casing (i.e. will lower case every word). We get this by loading the pre-trained bert-base-uncased tokenizer.

Set constants regarding text tokenization and processing such that we are consistent with how the model was trained.

Define tokenization functions and set up IMDB dataset
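As one illustration, the tokenizer and the associated constants could be set up roughly as follows; the variable names and the tokenize_and_cut helper are placeholders, not required names.

```python
from transformers import BertTokenizer

# Load the tokenizer matching the pre-trained model (lower-cases all text).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

init_token_idx = tokenizer.cls_token_id        # [CLS]
eos_token_idx = tokenizer.sep_token_id         # [SEP]
pad_token_idx = tokenizer.pad_token_id         # [PAD]
unk_token_idx = tokenizer.unk_token_id         # [UNK]
max_input_length = tokenizer.model_max_length  # 512 for bert-base-uncased

def tokenize_and_cut(sentence):
    # leave room for the [CLS] and [SEP] special tokens
    tokens = tokenizer.tokenize(sentence)
    return tokens[:max_input_length - 2]
```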

Create iterator to sample batches from the dataset.

Build the Model

Next, we'll load the pre-trained model, making sure to load the same model as we did for the tokenizer.

Next, we'll define our actual model.

Instead of using an embedding layer to get embeddings for our text, we'll be using the pre-trained transformer model. These embeddings will then be fed into a GRU to produce a prediction for the sentiment of the input sentence. We get the embedding dimension size (called the hidden_size) from the transformer via its config attribute. The rest of the initialization is standard.

Within the forward pass, we wrap the transformer in a no_grad block to ensure no gradients are calculated over this part of the model. The transformer actually returns the embeddings for the whole sequence as well as a pooled output. The documentation states that the pooled output is "usually not a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence", hence we will not be using it. The rest of the forward pass is the standard implementation of a recurrent model, where we take the hidden state of the final time-step and pass it through a linear layer to get our predictions. When using a bidirectional GRU, we concatenate the final hidden states of the forward and backward directions.
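A minimal sketch of such a model is shown below, assuming the transformers BertModel instance is passed in as bert; the class and argument names are illustrative.

```python
import torch
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Frozen BERT embeddings -> multi-layer (bi)GRU -> linear head producing one logit."""
    def __init__(self, bert, hidden_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.hidden_size            # size of BERT's output embeddings
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional, batch_first=True,
                          dropout=0 if n_layers < 2 else dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):                                # text: (B, T) token ids
        with torch.no_grad():                               # no gradients through BERT
            embedded = self.bert(text)[0]                   # (B, T, emb_dim) sequence output
        _, hidden = self.rnn(embedded)                      # (layers * dirs, B, hidden_dim)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.out(hidden)                             # (B, 1) sentiment logit
```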

Next, we create an instance of our model. You need to select hyperparameters.

In order to freeze the BERT parameters (not train them) we need to set their requires_grad attribute to False. To do this, we simply loop through all of the named_parameters in our model and, if they're part of the bert transformer model, set requires_grad = False.
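A short sketch of that loop, assuming the model instance is called model and the transformer is stored in an attribute named bert (as in the sketch above):

```python
# Freeze the BERT parameters so only the GRU and the linear head are trained.
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False
```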

Train the Model

As is standard, we define our optimizer and criterion (loss function).

Next, we'll define functions for calculating accuracy, performing a training epoch, performing an evaluation epoch, and measuring how long a training/evaluation epoch takes.
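As a sketch, the accuracy metric and a training epoch could look roughly like this, assuming the iterator yields (text, label) batches and criterion is nn.BCEWithLogitsLoss; the evaluation epoch is analogous but wrapped in model.eval() and torch.no_grad().

```python
import torch

def binary_accuracy(preds, y):
    """Fraction of predictions matching the labels; preds are raw logits."""
    rounded = torch.round(torch.sigmoid(preds))
    return (rounded == y).float().mean()

def train_epoch(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss, epoch_acc = 0.0, 0.0
    for text, labels in iterator:                    # batch format is an assumption
        text, labels = text.to(device), labels.float().to(device)
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)         # (B,) logits
        loss = criterion(predictions, labels)        # nn.BCEWithLogitsLoss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += binary_accuracy(predictions, labels).item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
```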

Finally, we'll train our model.

Please train your model such that it reaches 90% validation accuracy. This is possible to accomplish within 15 minutes of training on GPU with the correct implementation and hyperparameters. Feel free to adjust the hyperparameters defined above in order to get the desired performance. Your points received will scale linearly from 0 for 50% accuracy to 8 for at least 90% accuracy.

Load the parameters that gave us the best validation loss and evaluate them on the test set.

Inference

We'll then use the model to test the sentiment of some sequences. We tokenize the input sequence, trim it down to the maximum length, add the special tokens to either side, convert it to a tensor, add a fake batch dimension and then pass it through our model. Feel free to add more test cases!
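One possible shape for such an inference helper, assuming the tokenizer, model, and device from earlier; the function name is a placeholder.

```python
import torch

def predict_sentiment(model, tokenizer, sentence, device, max_length=512):
    """Returns the predicted probability of positive sentiment for one sentence."""
    model.eval()
    tokens = tokenizer.tokenize(sentence)[:max_length - 2]            # trim to max length
    indexed = ([tokenizer.cls_token_id]
               + tokenizer.convert_tokens_to_ids(tokens)
               + [tokenizer.sep_token_id])                            # add special tokens
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)        # fake batch dimension
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

# e.g. predict_sentiment(model, tokenizer, "This film is terrible", device)
```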

Conceptual Questions

  1. Why is the residual connection crucial in the Transformer architecture? [5 points]

Residual connections help the network train by allowing gradients to flow directly through the network without passing through the nonlinear activations, which mitigates vanishing gradients in deep stacks of blocks. They also let each block retain its original input information (including the positional encodings), so a block only has to learn a refinement of its input rather than a complete transformation.

  2. Why is Layer Normalization important in the Transformer architecture? [5 points]

Layer normalization is used to stabilize the network and can reduce training time. It also plays an important role in controlling the scale of activations and gradients across layers and reduces internal covariate shift.

  3. Why do we use the scaling factor of $1/\sqrt{d_k}$ in Scaled Dot Product Attention? If we remove it, what is going to happen? [5 points]

The scaling factor keeps the dot products from growing too large in magnitude as $d_k$ increases. Without it, the softmax saturates (its output approaches a one-hot distribution), so its gradients become extremely small, causing a vanishing-gradient problem that slows or destabilizes training.

End
