LLM Fine Tune
Positional encoding

Pivotal in transformers and sequence-to-sequence models, conveying critical information regarding the positions or sequencing of elements within a given sequence.

class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """
    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)
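A minimal usage sketch (assuming torch, math, and torch.nn are imported; the embedding table and batch shapes below are illustrative): embed a batch of token IDs and add positional information.

import math
import torch
import torch.nn as nn

d_model = 100
embedding = nn.Embedding(5000, d_model)        # token embedding table
pos_encoding = PositionalEncoding(d_model=d_model)

tokens = torch.randint(0, 5000, (32, 20))      # (batch_size, seq_len) of token IDs
x = embedding(tokens)                          # (32, 20, 100) embedded tokens
x = pos_encoding(x)                            # same shape, with positions added and dropout applied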
GloVe embeddings

An unsupervised learning algorithm to obtain vector representations for words. The GloVe model is trained on aggregated global word-to-word co-occurrence statistics from a corpus, and the resulting representations show linear substructures of the word vector space.

class GloVe_override(Vectors):
    url = {
        "6B": "https://cf-courses-data.s3.us.cloud-object-storage.appd",  # URL truncated in the source
    }
    def __init__(self, name="6B", dim=100, **kwargs) -> None:
        url = self.url[name]
        name = "glove.{}.{}d.txt".format(name, str(dim))
        #name = "glove.{}/glove.{}.{}d.txt".format(name, name, str(dim))
        super(GloVe_override, self).__init__(name, url=url, **kwargs)

class GloVe_override2(Vectors):
    url = {
        "6B": "https://cf-courses-data.s3.us.cloud-object-storage.appd",  # URL truncated in the source
    }
    def __init__(self, name="6B", dim=100, **kwargs) -> None:
        url = self.url[name]
        #name = "glove.{}.{}d.txt".format(name, str(dim))
        name = "glove.{}/glove.{}.{}d.txt".format(name, name, str(dim))
        super(GloVe_override2, self).__init__(name, url=url, **kwargs)

try:
    glove_embedding = GloVe_override(name="6B", dim=100)
except:
    try:
        glove_embedding = GloVe_override2(name="6B", dim=100)
    except:
        glove_embedding = GloVe(name="6B", dim=100)
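A small usage sketch (assuming torchtext is installed and the vectors have been downloaded): look up 100-dimensional vectors for individual tokens and compare two words with cosine similarity.

import torch
import torch.nn.functional as F

# Fetch the 100-dimensional vectors for two tokens
king = glove_embedding.get_vecs_by_tokens(["king"])[0]
queen = glove_embedding.get_vecs_by_tokens(["queen"])[0]

# Cosine similarity between the two word vectors
similarity = F.cosine_similarity(king.unsqueeze(0), queen.unsqueeze(0))
print(similarity.item())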
Convert the data set objects to data loaders

Used in PyTorch-based projects. It includes creating data set objects, specifying data-loading parameters, and converting these data sets into data loaders.

BATCH_SIZE = 32

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
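The collate_fn name is cut off in the source; a hypothetical collate_batch compatible with these loaders might look like the sketch below (it assumes the text_pipeline, label_pipeline, and device objects used elsewhere in this sheet).

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    # Convert raw (label, text) pairs into padded tensors on the target device
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        text_list.append(torch.tensor(text_pipeline(_text), dtype=torch.int64))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)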
Training function

Helps train the model by iteratively updating its parameters to minimize the loss function, improving the model's performance on a given task.

def train_model(model, optimizer, criterion, train_dataloader, valid_dataloader, epochs=1000, save_dir="", file_name=None):
    cum_loss_list = []
    acc_epoch = []
    acc_old = 0
    model_path = os.path.join(save_dir, file_name)
    acc_dir = os.path.join(save_dir, os.path.splitext(file_name)[0] + "_acc")    # suffix truncated in the source
    loss_dir = os.path.join(save_dir, os.path.splitext(file_name)[0] + "_loss")  # suffix truncated in the source
    time_start = time.time()
    for epoch in tqdm(range(1, epochs + 1)):
        model.train()
        cum_loss = 0
        for idx, (label, text) in enumerate(train_dataloader):
            optimizer.zero_grad()
            label, text = label.to(device), text.to(device)
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
            cum_loss += loss.item()
        print(f"Epoch {epoch}/{epochs} - Loss: {cum_loss}")
        cum_loss_list.append(cum_loss)
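The function is cut off here by a page break in the source; a hedged sketch of how the remainder might evaluate on the validation loader and checkpoint the best model (none of this continuation is from the source):

        # --- continuation sketch, not from the source ---
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for label, text in valid_dataloader:
                label, text = label.to(device), text.to(device)
                predicted = model(text).argmax(dim=1)
                correct += (predicted == label).sum().item()
                total += label.size(0)
        acc = correct / total
        acc_epoch.append(acc)
        if acc > acc_old:
            acc_old = acc
            torch.save(model.state_dict(), model_path)   # keep the best checkpoint
    print(f"Training time: {time.time() - time_start:.1f}s")
    return cum_loss_list, acc_epoch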
Fine-tune a model on the AG News data set

Fine-tuning a pretrained model on the AG News data set categorizes news articles into one of four categories: Sports, Business, Sci/Tech, or World. Start by training a model from scratch on the AG News data set. If you want to train the model for 2 epochs on a smaller data set to demonstrate what the training process would look like, uncomment the part that says ### Uncomment to Train ### before running the cell. Training for 2 epochs on the reduced data set can take approximately 3 minutes.

train_iter_ag_news = AG_NEWS(split="train")
num_class_ag_news = len(set([label for (label, text) in train_iter_ag_news]))
num_class_ag_news

# Split the dataset into training and testing iterators.
train_iter_ag_news, test_iter_ag_news = AG_NEWS()

# Convert the training and testing iterators to map-style datasets.
train_dataset_ag_news = to_map_style_dataset(train_iter_ag_news)
test_dataset_ag_news = to_map_style_dataset(test_iter_ag_news)

# Determine the number of samples to be used for training and validation.
num_train_ag_news = int(len(train_dataset_ag_news) * 0.95)

# Randomly split the training dataset into training and validation datasets.
# The training dataset will contain 95% of the samples, and the validation set the remaining 5%.
split_train_ag_news_, split_valid_ag_news_ = random_split(
    train_dataset_ag_news, [num_train_ag_news, len(train_dataset_ag_news) - num_train_ag_news]
)

# Make the training set smaller to allow it to run fast as an example.
# IF YOU WANT TO TRAIN ON THE AG_NEWS DATASET, COMMENT OUT THE 2 LINES BELOW.
# HOWEVER, NOTE THAT TRAINING WILL TAKE A LONG TIME
num_train_ag_news = int(len(train_dataset_ag_news) * 0.05)
split_train_ag_news_, _ = random_split(
    split_train_ag_news_, [num_train_ag_news, len(split_train_ag_news_) - num_train_ag_news]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

def label_pipeline(x):
    return int(x) - 1

from torch.nn.utils.rnn import pad_sequence

def collate_batch_ag_news(batch):
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        text_list.append(torch.tensor(text_pipeline(_text), dtype=torch.int64))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)

BATCH_SIZE = 32

train_dataloader_ag_news = DataLoader(
    split_train_ag_news_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_ag_news
)
valid_dataloader_ag_news = DataLoader(
    split_valid_ag_news_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_ag_news
)
test_dataloader_ag_news = DataLoader(
    test_dataset_ag_news, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_ag_news
)

model_ag_news = Net(num_class=4, vocab_size=vocab_size).to(device)
model_ag_news.to(device)

'''
### Uncomment to Train ###
LR = 1
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_ag_news.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
save_dir = ""
file_name = "model_AG News small1.pth"
train_model(model=model_ag_news, optimizer=optimizer, criterion=criterion,
            train_dataloader=train_dataloader_ag_news, valid_dataloader=valid_dataloader_ag_news,
            epochs=2, save_dir=save_dir, file_name=file_name)
'''
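A short follow-up sketch for inspecting predictions (assuming the model has been trained or loaded): AG News labels 1-4 map to indices 0-3 after label_pipeline, so the predicted indices can be translated back to category names.

ag_news_classes = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

model_ag_news.eval()
with torch.no_grad():
    labels, texts = next(iter(test_dataloader_ag_news))   # one padded batch, already on device
    predictions = model_ag_news(texts).argmax(dim=1)

print([ag_news_classes[int(p)] for p in predictions[:5]])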
Adaptor model

FeatureAdapter is a neural network module that introduces a low-dimensional bottleneck in a transformer architecture to allow fine-tuning with fewer parameters. It compresses the original high-dimensional embeddings into a lower dimension, applies a nonlinear transformation, and then expands it back to the original dimension. This process is followed by a residual connection that adds the transformed output back to the original input to preserve information and promote gradient flow.

class FeatureAdapter(nn.Module):
    """
    Attributes:
        size (int): The bottleneck dimension to which the embeddings are compressed.
        model_dim (int): The original dimension of the embeddings or features.
    """
    def __init__(self, bottleneck_size=50, model_dim=100):
        super().__init__()
        self.bottleneck_transform = nn.Sequential(
            nn.Linear(model_dim, bottleneck_size),  # Down-project to the bottleneck size
            nn.ReLU(),                              # Apply non-linearity
            nn.Linear(bottleneck_size, model_dim)   # Up-project back to the model dimension
        )

    def forward(self, x):
        """
        Forward pass of the FeatureAdapter. Applies the bottleneck transform to the input
        tensor and adds a skip connection.

        Args:
            x (Tensor): Input tensor with shape (batch_size, seq_length, model_dim).
        Returns:
            Tensor: Output tensor after applying the adapter transformation,
            maintaining the original input shape.
        """
        transformed_features = self.bottleneck_transform(x)  # Transform features through the bottleneck
        output_with_residual = transformed_features + x      # Add the residual (skip) connection
        return output_with_residual
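A hedged sketch of how such an adapter is typically used: freeze the pretrained model and wrap one of its linear sublayers so that only the adapter parameters are trained (the wrapper class and the fc1 layer name are illustrative assumptions, not from the source).

class LinearWithAdapter(nn.Module):
    # Wrap an existing (frozen) linear layer and add a trainable adapter after it
    def __init__(self, linear, bottleneck_size=50):
        super().__init__()
        self.linear = linear
        self.adapter = FeatureAdapter(bottleneck_size=bottleneck_size,
                                      model_dim=linear.out_features)

    def forward(self, x):
        return self.adapter(self.linear(x))

# Freeze the base model, then insert adapters where desired
for param in model.parameters():
    param.requires_grad = False
model.fc1 = LinearWithAdapter(model.fc1, bottleneck_size=50)   # 'fc1' is a hypothetical layer name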
Traverse the IMDB data set

This code snippet traverses the IMDB data set by obtaining, loading, and exploring the data. It also performs basic operations, visualizes the data, and analyzes and interprets the data set.

class IMDBDataset(Dataset):
    def __init__(self, root_dir, train=True):
        """
        root_dir: The base directory of the IMDB dataset.
        train: A boolean flag indicating whether to use training or test data.
        """
        self.root_dir = os.path.join(root_dir, "train" if train else "test")
        self.neg_files = [os.path.join(self.root_dir, "neg", f) for f in os.listdir(os.path.join(self.root_dir, "neg"))]
        self.pos_files = [os.path.join(self.root_dir, "pos", f) for f in os.listdir(os.path.join(self.root_dir, "pos"))]
        self.files = self.neg_files + self.pos_files
        self.labels = [0] * len(self.neg_files) + [1] * len(self.pos_files)
        self.pos_inx = len(self.pos_files)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        label = self.labels[idx]
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        return label, content
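A brief usage sketch (the ./aclImdb directory is the conventional layout for this data set and is an assumption here):

train_ds = IMDBDataset(root_dir="./aclImdb", train=True)
print(len(train_ds))                 # total number of reviews
label, review = train_ds[0]
print(label, review[:100])           # 0 = negative, 1 = positive

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)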
Training a BERT model for MLM task

This code snippet trains the model with the specified parameters and data set.

training_args = TrainingArguments(
    output_dir="./trained_model",   # Specify the output directory for the trained model
    overwrite_output_dir=True,
    do_eval=False,
    learning_rate=5e-5,
)
Generate text

This code snippet generates text sequences based on the input and doesn't compute the gradient when generating output.

# Generate text
output_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    num_return_sequences=1
)
output_ids

or

with torch.no_grad():
    outputs = model(**inputs)
outputs
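A short follow-up sketch showing how the inputs are typically built and how the generated IDs are decoded back into text (the prompt is a hypothetical example; tokenizer.decode is the standard Hugging Face call):

prompt = "Once upon a time"                       # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Decode the generated token IDs back into a string
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)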
ListDataset

Inherits from Dataset and creates a torch Dataset from a list. This class is then used to generate a Dataset object from instructions.

class ListDataset(Dataset):
    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]

instructions_torch = ListDataset(instructions)
gen_pipeline

This code snippet builds a text-generation pipeline around the model and tokenizer; the pipeline takes the token IDs from the model output, decodes them back into text, and returns the responses.

gen_pipeline = pipeline("text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        device=device,
                        batch_size=2,
                        max_length=50,
                        truncation=True,
                        padding=False,
                        return_full_text=False)
torch.no_grad()

This code generates text from the given input using a pipeline while optimizing resource usage by limiting the input size and disabling gradient calculation.

with torch.no_grad():
    # Due to resource limitation, only apply the function on 3 records
    pipeline_iterator = gen_pipeline(instructions_torch[:3],
                                     max_length=50,   # this is set to 50
                                     num_beams=5,
                                     early_stopping=True,)

    generated_outputs_base = []
    for text in pipeline_iterator:
        generated_outputs_base.append(text[0]["generated_text"])
SFTTrainer

This code snippet sets up a training configuration for the model with 'SFTConfig', then initializes the 'SFTTrainer' with the model, datasets, formatting function, and additional settings.

training_args = SFTConfig(
    output_dir="/tmp",
    num_train_epochs=10,
    save_strategy="epoch",
    fp16=True,
    per_device_train_batch_size=2,   # Reduce batch size
    per_device_eval_batch_size=2,    # Reduce batch size
    max_seq_length=1024,
    do_eval=True
)

trainer = SFTTrainer(
    model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    formatting_func=formatting_prompts_func,
    args=training_args,
    packing=False,
    data_collator=collator,
)
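The formatting_prompts_func and collator referenced above are defined elsewhere in the lab; a hedged sketch of what they commonly look like with TRL (the 'instruction'/'output' field names and the response template are assumptions):

from trl import DataCollatorForCompletionOnlyLM

def formatting_prompts_func(example):
    # Turn each record of a batch into a single prompt/response string
    output_texts = []
    for i in range(len(example["instruction"])):
        text = f"### Instruction:\n{example['instruction'][i]}\n### Response:\n{example['output'][i]}"
        output_texts.append(text)
    return output_texts

# Mask the prompt tokens so the loss is computed only on the response part
collator = DataCollatorForCompletionOnlyLM("### Response:\n", tokenizer=tokenizer)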
def plot_matrix_and_subspace(F)

The code snippet is useful for visualizing the column vectors of a matrix, and the subspace they span, in 3D space.

def plot_matrix_and_subspace(F):
    assert F.shape[0] == 3, "Matrix F must have rows equal to 3 for 3D plotting"
    ax = plt.figure().add_subplot(projection='3d')
    # Plot each column vector of F as a point and line from the origin
    for i in range(F.shape[1]):
        ax.quiver(0, 0, 0, F[0, i], F[1, i], F[2, i], color='blue', arrow_length_ratio=0.1)
    if F.shape[1] == 2:
        # Calculate the normal to the plane spanned by the columns of F
        normal_vector = np.cross(F[:, 0], F[:, 1])
        # Plot the plane
        xx, yy = np.meshgrid(np.linspace(-3, 3, 10), np.linspace(-3, 3, 10))
        zz = (-normal_vector[0] * xx - normal_vector[1] * yy) / normal_vector[2]
        ax.plot_surface(xx, yy, zz, alpha=0.5, color='green', label='Subspace')
    # Set plot limits and labels
    ax.set_xlim([-3, 3])
    ax.set_ylim([-3, 3])
    ax.set_zlim([-3, 3])
    ax.set_xlabel('$x_{1}$')
    ax.set_ylabel('$x_{2}$')
    ax.set_zlabel('$x_{3}$')
    #ax.legend()
    plt.show()
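A quick usage sketch (assuming numpy and matplotlib are imported as np and plt): plot two linearly independent columns and the plane they span.

import numpy as np
import matplotlib.pyplot as plt

# Two linearly independent columns in R^3; the green surface is their span
F = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, 1.0]])
plot_matrix_and_subspace(F)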
nn.Parameter

The provided code defines the trainable parameters of the 'LoRALayer' module used during training. The 'LoRALayer' is used as an intermediate layer in a simple neural network.

class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        x = self.alpha * (x @ self.A @ self.B)
        return x
load_dataset

The data set is loaded using the load_dataset function from the datasets library, specifically loading the "train" split.

dataset_name = "imdb"
ds = load_dataset(dataset_name, split="train")

N = 5
for sample in range(N):
    print('text', ds[sample]['text'])
    print('label', ds[sample]['label'])

ds = ds.rename_columns({"text": "review"})
ds
ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
build_dataset

Incorporates the necessary steps to build a data set object for use as an input to PPOTrainer.

del(ds)
dataset_name = "imdb"
ds = load_dataset(dataset_name, split="train")
ds = ds.rename_columns({"text": "review"})

def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`; one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds
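A hedged usage sketch (the PPOConfig import and the model name follow TRL's PPO sentiment example and are assumptions here; the exact API differs between TRL versions):

from trl import PPOConfig
from trl.core import LengthSampler

config = PPOConfig(model_name="lvwerra/gpt2-imdb")   # hypothetical model choice
dataset = build_dataset(config)
print(dataset[0]["query"], dataset[0]["input_ids"])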
Tokenizing data

This code snippet instantiates a tokenizer from the BERT base cased model, defines a 'tokenize_function' that applies padding and truncation to the text, and maps it over the data set in batches.

# Instantiate a tokenizer using the BERT base cased model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Define a function to tokenize examples
def tokenize_function(examples):
    # Tokenize the text using the tokenizer
    # Apply padding to ensure all sequences have the same length
    # Apply truncation to limit the maximum sequence length
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenize function to the dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)
Training loop

The train_model function trains a model using a set of training data provided through a dataloader. It begins by setting up a progress bar to help monitor the training progress visually. The model is switched to training mode, which is necessary for certain model behaviors like dropout to work correctly during training. The function processes the data in batches for each epoch, which involves several steps per batch: transferring the data to the correct device (like a GPU), running the data through the model to get outputs and calculate the loss, updating the model's parameters using the calculated gradients, adjusting the learning rate, and clearing the old gradients.

def train_model(model, tr_dataloader):
    # Create a progress bar to track the training progress
    progress_bar = tqdm(range(num_training_steps))
    # Set the model in training mode
    model.train()
    tr_losses = []
    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        # Iterate over the training data batches
        for batch in tr_dataloader:
            # Move the batch to the appropriate device
            batch = {k: v.to(device) for k, v in batch.items()}
            # Forward pass through the model
            outputs = model(**batch)
            # Compute the loss
            loss = outputs.loss
            # Backward pass (compute gradients)
            loss.backward()
            total_loss += loss.item()
            # Update the model parameters
            optimizer.step()
            # Update the learning rate scheduler
            lr_scheduler.step()
            # Clear the gradients
            optimizer.zero_grad()
            # Update the progress bar
            progress_bar.update(1)
        tr_losses.append(total_loss / len(tr_dataloader))
    # Plot the loss per epoch
    plt.plot(tr_losses)
    plt.title("Training loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
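This function relies on several globals defined elsewhere in the lab (optimizer, lr_scheduler, num_epochs, num_training_steps, device); a hedged setup sketch of what they typically look like with a Hugging Face model (names and values here are assumptions):

from torch.optim import AdamW
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(tr_dataloader)

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

train_model(model, tr_dataloader)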
Configure BitsAndBytes

Defines the quantization parameters.

config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize the model to 4-bits when you load it
    bnb_4bit_quant_type="nf4",              # use a special 4-bit data type for weights initialized from a normal distribution
    bnb_4bit_use_double_quant=True,         # nested quantization scheme to quantize the already quantized weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # use bfloat16 for faster computation
    llm_int8_skip_modules=["classifier", "pre_classifier"]  # Don't convert the "classifier" and "pre_classifier" layers to low precision
)
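A hedged usage sketch: pass the config when loading a pretrained model (the DistilBERT checkpoint and label count are illustrative assumptions, chosen because the skipped modules above match DistilBERT's classification head):

import torch
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",          # hypothetical checkpoint
    quantization_config=config_bnb,
    num_labels=2,
)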