GPT2 From Scratch in PyTorch
The purpose of this notebook is to guide you through the process of building a Generative Pre-trained
Transformer 2 (GPT-2) model from scratch. GPT-2 is a language generation model developed by OpenAI;
it was trained on a large corpus of text data and can generate coherent and contextually relevant text.
The impact of this model has been significant, as it demonstrated the ability to generate human-like text
and to perform well on a variety of natural language processing tasks. In this notebook, we will explore the
architecture of the GPT-2 model, train it on a text dataset, and evaluate its performance on a text generation
task.
GPT-2 uses the newer decoder-only variant of the Transformer architecture, which is designed for language
modeling tasks. The Transformer architecture has been widely adopted in the field of natural language
processing due to its ability to capture long-range dependencies in text data and its parallelizable nature.
By contrast, the original Transformer architecture consists of an encoder and a decoder: the encoder
processes the input sequence and produces a sequence of hidden states, while the decoder generates the
output sequence based on those hidden states and the previously generated output tokens.
The notebook will cover the following topics:
Overview of the Transformer architecture: understand the key components of the Transformer architecture, including self-attention mechanisms and feedforward neural networks.
Data preparation: learn how to prepare and preprocess the text data for training the GPT-2 model.
Training: train the GPT-2 model on the preprocessed text data, optimizing its parameters to minimize a suitable loss function.
Inference: generate text using the trained GPT-2 model and evaluate its performance on a text generation task.
In [ ]: import math

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
1. Input Embeddings
In [ ]: class InputEmbedding(nn.Module):
    def __init__(self, embed_dim: int, vocab_size: int):
        """
        Initialize the InputEmbedding module.

        Args:
            embed_dim (int): The dimensionality of the input embedding.
            vocab_size (int): The size of the vocabulary.
        """
        super().__init__()
        # Store the dimensionality and vocabulary size
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        # Embedding layer mapping token indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x):
        """
        Args:
            x (tensor): The input tensor.

        Returns:
            tensor: The embedded input tensor after scaling it by the square root of the dimensionality.
        """
        # Embed the input tensor using the embedding layer
        # Shape: (batch_size, seq_len) -> (batch_size, seq_len, embed_dim)
        embedded_input = self.embedding(x)
        # Scale the embedded input tensor by the square root of the dimensionality
        # Shape: (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        scaled_embedded_input = embedded_input * torch.sqrt(torch.tensor(self.embed_dim))
        return scaled_embedded_input
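As a quick shape check (the vocabulary size, batch size, and dimensions below are toy values chosen purely for illustration), the module maps integer token indices to scaled dense vectors:

# Toy vocabulary of 10 tokens, embedding dimension 8
embedding = InputEmbedding(embed_dim=8, vocab_size=10)
tokens = torch.randint(0, 10, (2, 4))    # (batch_size, seq_len)
print(embedding(tokens).shape)           # torch.Size([2, 4, 8])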
2. Positional Encoding
In [ ]: class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim: int = 512, max_seq_len: int = 100, dropout: float = 0.1):
        """Initialize the PositionalEncoding module."""
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(dropout)
        # Precompute the positional encoding matrix
        self.positional_encoding = self._precompute_positional_encoding(max_seq_len, embed_dim)

    def _precompute_positional_encoding(self, max_seq_len: int, embed_dim: int):
        # Sinusoidal encoding: even dimensions use sine, odd dimensions use cosine
        position = torch.arange(max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        positional_encoding = torch.zeros(1, max_seq_len, embed_dim)
        positional_encoding[0, :, 0::2] = torch.sin(position * div_term)
        positional_encoding[0, :, 1::2] = torch.cos(position * div_term)
        return positional_encoding

    def forward(self, x):
        # Add the positional encodings for the current sequence length, then apply dropout
        positional_encoding = self.positional_encoding[:, : x.size(1), :].to(x.device)
        return self.dropout(x + positional_encoding)
3. Layer Normalization
In [ ]: class LayerNormalization(nn.Module):
    def __init__(self, embed_dim: int, eps: float = 1e-6):
        """Initialize the LayerNormalization module."""
        super().__init__()
        self.eps = eps
        # Create two learnable parameters to scale and shift the normalized input
        self.gain = nn.Parameter(torch.Tensor(embed_dim).uniform_())  # Initialize with values sampled from a uniform distribution
        self.bias = nn.Parameter(torch.Tensor(embed_dim).normal_())   # Initialize with values sampled from a normal distribution

    def forward(self, x):
        # Normalize over the embedding dimension, then apply the learnable scale and shift
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.gain * (x - mean) / (std + self.eps) + self.bias
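A small illustration (toy shapes, for inspection only): each position's activations are normalized across the embedding dimension, and only then scaled by the learned gain and shifted by the learned bias.

# Activations with a non-zero mean and a large spread
x = torch.randn(2, 4, 8) * 5 + 3
layer_norm = LayerNormalization(embed_dim=8)
normalized = (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + 1e-6)
print(normalized.mean(dim=-1).abs().max())  # close to 0 before gain and bias are applied
print(layer_norm(x).shape)                  # torch.Size([2, 4, 8])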
4. Multi-Head Attention
In [ ]: class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim: int = 512, num_heads: int = 8, attn_dropout: float = 0.1, ff_dropout: float = 0.1, max_len: int = 512):
        super().__init__()
        self.num_heads = num_heads
        assert embed_dim % self.num_heads == 0, "invalid heads and embedding dimension configuration"
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.attn_dropout = nn.Dropout(attn_dropout)
        self.proj_dropout = nn.Dropout(ff_dropout)
        # Create a buffer to store the causal mask with no grad
        # Shape: (max_len, max_len), True above the diagonal (future positions)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(max_len, max_len, dtype=torch.bool), diagonal=1),
        )

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.shape
        head_dim = embed_dim // self.num_heads
        # Project and split into heads: (batch_size, num_heads, seq_len, head_dim)
        q = self.query(x).view(batch_size, seq_len, self.num_heads, head_dim).transpose(1, 2)
        k = self.key(x).view(batch_size, seq_len, self.num_heads, head_dim).transpose(1, 2)
        v = self.value(x).view(batch_size, seq_len, self.num_heads, head_dim).transpose(1, 2)
        # Scaled dot-product attention with the causal mask applied
        attn = (q @ k.transpose(-2, -1)) / math.sqrt(head_dim)
        attn = attn.masked_fill(self.mask[:seq_len, :seq_len], float("-inf"))
        attn = self.attn_dropout(torch.softmax(attn, dim=-1))
        # Recombine the heads and apply the output projection
        out = (attn @ v).transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        return self.proj_dropout(self.proj(out))
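The DecoderBlock defined below also uses a FeedForwardBlock, but the cell defining it does not appear in this copy of the notebook. A minimal sketch consistent with the keyword arguments used later (embed_dim, intermediate_size, dropout) is the standard position-wise MLP with a GELU activation, as in GPT-2; the original cell may differ in its details.

class FeedForwardBlock(nn.Module):
    def __init__(self, embed_dim: int = 512, intermediate_size: int = 2048, dropout: float = 0.1):
        """Position-wise feed-forward network: expand, apply GELU, project back."""
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, intermediate_size),
            nn.GELU(),
            nn.Linear(intermediate_size, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        # Shape: (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        return self.net(x)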
6. Residual Connection
In [ ]: class ResidualConnection(nn.Module):
    def __init__(self, embed_dim, dropout: float = 0.1):
        """Initialize the ResidualConnection module."""
        super().__init__()
        self.layer_norm = LayerNormalization(embed_dim=embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # Pre-norm residual connection: x + dropout(sublayer(norm(x)))
        return x + self.dropout(sublayer(self.layer_norm(x)))
7. Projection Head
In [ ]: class ProjectionHead(nn.Module):
    def __init__(self, embed_dim: int, vocab_size: int):
        """Initialize the ProjectionHead module."""
        super().__init__()
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        # Shape: (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, vocab_size)
        return self.fc(x)
8. Transformer Block
In [ ]: class DecoderBlock(nn.Module):
    def __init__(
        self,
        embed_dim: int = 512,
        num_heads: int = 8,
        ff_dim: int = 2048,
        attn_dropout: float = 0.1,
        ff_dropout: float = 0.1,
        dropout: float = 0.1,
        max_len: int = 512,
    ):
        super().__init__()
        # Initialize multi-head self-attention mechanism
        self.MultiHeadAttention = MultiHeadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            attn_dropout=attn_dropout,
            ff_dropout=ff_dropout,
            max_len=max_len,
        )
        # Initialize feed-forward block
        self.feed_forward = FeedForwardBlock(
            embed_dim=embed_dim,
            intermediate_size=ff_dim,
            dropout=ff_dropout,
        )
        # Initialize residual connections
        self.residual_connection1 = ResidualConnection(embed_dim=embed_dim, dropout=dropout)
        self.residual_connection2 = ResidualConnection(embed_dim=embed_dim, dropout=dropout)

    def forward(self, x):
        # Self-attention sub-layer wrapped in a residual connection
        x = self.residual_connection1(x, self.MultiHeadAttention)
        # Feed-forward sub-layer wrapped in a residual connection
        x = self.residual_connection2(x, self.feed_forward)
        return x
9. GPT Model
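The cell defining the full GPT model, which the later cells train and sample from (note the GPT type hint in generate_text_until_end), is truncated in this copy; only the token-embedding step of its forward pass survives. A minimal sketch of how the components above could be assembled, consistent with the hyperparameters and calls used later, is shown below. Apart from token_embedding, the attribute names are assumptions, and attention_mask is accepted only so the training loop's call signature works.

class GPT(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int = 768, max_len: int = 1024,
                 embed_dropout: float = 0.1, num_blocks: int = 12, num_heads: int = 12,
                 ff_dim: int = 3072, attn_dropout: float = 0.1, ff_dropout: float = 0.1):
        super().__init__()
        # Token embedding and positional encoding
        self.token_embedding = InputEmbedding(embed_dim=embed_dim, vocab_size=vocab_size)
        self.positional_encoding = PositionalEncoding(
            embed_dim=embed_dim, max_seq_len=max_len, dropout=embed_dropout
        )
        # Stack of decoder blocks
        self.blocks = nn.ModuleList([
            DecoderBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
                         attn_dropout=attn_dropout, ff_dropout=ff_dropout, max_len=max_len)
            for _ in range(num_blocks)
        ])
        # Final layer normalization and projection to vocabulary logits
        self.layer_norm = LayerNormalization(embed_dim=embed_dim)
        self.projection_head = ProjectionHead(embed_dim=embed_dim, vocab_size=vocab_size)

    def forward(self, input_ids, attention_mask=None):
        # attention_mask is accepted for compatibility with the training loop but unused in this sketch
        # Token embedding
        # Shape: (batch_size, seq_len) -> (batch_size, seq_len, embed_dim)
        x = self.token_embedding(input_ids)
        # Add positional encodings, then pass through the decoder stack
        x = self.positional_encoding(x)
        for block in self.blocks:
            x = block(x)
        # Normalize and project to logits over the vocabulary
        x = self.layer_norm(x)
        return self.projection_head(x)  # (batch_size, seq_len, vocab_size)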
10. Data Preparation
In [ ]: class GPTDataset(Dataset):
    def __init__(self, data: list, tokenizer, max_length: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.end_token = tokenizer.eos_token_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Tokenize, truncate, and append the end-of-sequence token
        token_ids = self.tokenizer.encode(self.data[idx])[: self.max_length - 1] + [self.end_token]
        # Attention mask: 1 for real tokens, 0 for the padding added below
        mask = torch.tensor([1] * len(token_ids) + [0] * (self.max_length - len(token_ids)))
        # Pad with the end token so every sample has the same length
        token_ids = torch.tensor(token_ids + [self.end_token] * (self.max_length - len(token_ids)))
        # Next-token prediction: labels are the inputs shifted one position to the left
        return token_ids[:-1], token_ids[1:], mask[:-1]
In [ ]: tokenizer = AutoTokenizer.from_pretrained("gpt2")

train_dataset = GPTDataset(
    data=sample_data,
    tokenizer=tokenizer,
    max_length=200,
)

# Inspect a single training example
input_ids, label, mask = train_dataset[0]
print("Label:", label)
print("Input IDs:", input_ids)
11. Training
In [ ]: model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# lr, batch_size, and num_epochs are assumed to be defined in an earlier hyperparameter cell
for epoch in range(num_epochs):
    total_loss = 0.0
    for input_ids, labels, mask in train_loader:
        input_ids, labels, mask = input_ids.to(device), labels.to(device), mask.to(device)
        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=mask)
        # Next-token cross-entropy over the flattened predictions
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}: loss = {total_loss / len(train_loader):.4f}")
12. Inference
In [ ]: vocab_size = 50257
embed_dim = 768
max_len = 1024
embed_dropout = 0.1
num_blocks = 12 # 12 for GPT-2 small; 48 for GPT-2 XL
num_heads = 12 # 12 for GPT-2 small; 25 for GPT-2 XL
ff_dim = 3072
attn_dropout = 0.1
ff_dropout = 0.1
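The cell that actually builds the model from these hyperparameters does not appear in this copy. Assuming the GPT class sketched earlier, it would look roughly like the following; note that creating a fresh model here would start from random weights, so in practice the model trained above would be reused or its saved state dict loaded.

# Hypothetical instantiation matching the GPT sketch above
model = GPT(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    max_len=max_len,
    embed_dropout=embed_dropout,
    num_blocks=num_blocks,
    num_heads=num_heads,
    ff_dim=ff_dim,
    attn_dropout=attn_dropout,
    ff_dropout=ff_dropout,
)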
In [ ]: model_name = "gpt2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
In [ ]: model = model.to(device)
model.eval()

iterations = []
n_steps = 10
choices_per_step = 5

# input_ids: the tokenized prompt to continue, shape (1, prompt_len)
with torch.no_grad():
    for _ in range(n_steps):
        iteration = dict()
        iteration["Input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        # Select logits of the first batch and the last token and apply softmax to get the probabilities
        next_token_logits = output[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        # Record the most likely next tokens and their probabilities for this step
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].item()
            iteration[f"Choice {choice_idx + 1}"] = f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
        # Greedily append the most likely token and continue generating
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        iterations.append(iteration)

sample_inference = pd.DataFrame(iterations)
sample_inference.head()
In [ ]: def generate_text_until_end(
    input_text: str,
    model: GPT,
    tokenizer: AutoTokenizer,
    max_length: int = 100,
    device='cpu',
):
    model = model.to(device)
    model.eval()
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    end_token_id = tokenizer.eos_token_id
    generated_ids = input_ids.flatten().clone()  # Convert to 1-dimensional tensor
    with torch.no_grad():
        while True:
            output = model(input_ids=input_ids)
            next_token_logits = output[:, -1, :]
            # Softmax is unnecessary for greedy decoding: the most likely token
            # has the highest logit both before and after softmax
            # next_token_probs = torch.softmax(next_token_logits, dim=-1)
            next_token_id = torch.argmax(next_token_logits, dim=-1)
            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
            # Stop at the end-of-sequence token or once max_length tokens have been produced
            if next_token_id.item() == end_token_id or generated_ids.size(0) >= max_length:
                break
            # Feed the whole sequence back in (no key/value cache is used here)
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0)], dim=-1)
    return tokenizer.decode(generated_ids)
In [ ]: # Example usage:
generated_text = generate_text_until_end(
input_text="I like to eat",
model=model,
tokenizer=tokenizer,
max_length=20,
device=device,
)
print(generated_text)