from pathlib import Path
from urllib.request import Request, urlopen
import json
import shutil
import zipfile
# URLs of the JSON manifests describing downloadable resources. The download
# loop in this file reads the keys "archive_url", "expected_paths",
# "extract_to", and "name" from each manifest.
RESOURCE_MANIFESTS = [
"https://assets.deeplearningnotes.com/code-support-resources/datasets/books/latest.json",
]
def download_file(url, path, timeout=60):
    """Download ``url`` and store the response body at ``path``.

    The body is streamed into a sibling ``<name>.part`` file and moved into
    place only after the transfer finished, so an interrupted download never
    leaves a truncated file at ``path``.

    Args:
        url: Address to fetch; any scheme ``urllib.request`` can open.
        path: Destination file (``str`` or ``Path``); replaced if it exists.
        timeout: Seconds ``urlopen`` may block on the network before giving
            up. Without a timeout a stalled server hangs the caller forever.

    Raises:
        urllib.error.URLError: If the request fails (a subclass of ``OSError``).
        OSError: If the destination cannot be written.
    """
    destination = Path(path)
    partial = destination.with_name(destination.name + ".part")
    # Browser-like User-Agent; presumably the host rejects urllib's default one.
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urlopen(request, timeout=timeout) as response, open(partial, "wb") as file:
            shutil.copyfileobj(response, file)
        partial.replace(destination)
    finally:
        # Only true after a failure: a successful move already consumed the file.
        if partial.exists():
            partial.unlink()
def extract_zip_safely(zip_path, target_dir="."):
    """Extract ``zip_path`` into ``target_dir``, rejecting path-traversal members.

    Every member is validated before anything is written, so an unsafe archive
    is rejected as a whole.

    Args:
        zip_path: Path of the zip archive to read.
        target_dir: Directory to extract into (defaults to the current one).

    Raises:
        RuntimeError: If a member would resolve to a location outside
            ``target_dir``.
    """
    target_dir = Path(target_dir).resolve()
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        for member in zip_ref.infolist():
            target_path = (target_dir / member.filename).resolve()
            # Compare path components, not text prefixes: "/data/out_evil"
            # starts with "/data/out" as a string but is not inside it.
            try:
                target_path.relative_to(target_dir)
            except ValueError:
                raise RuntimeError(f"Unsafe zip path: {member.filename}") from None
        zip_ref.extractall(target_dir)
# Fetch each resource manifest and make sure the files it lists exist locally,
# downloading and unpacking the archive only when something is missing.
for manifest_url in RESOURCE_MANIFESTS:
    # The resource name is the URL segment right before the manifest file name.
    name = manifest_url.rstrip("/").rsplit("/", 2)[-2]
    manifest_path = Path(f"{name}-latest.json")
    archive_path = Path(f"{name}.zip")

    download_file(manifest_url, manifest_path)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    expected_paths = [Path(entry) for entry in manifest.get("expected_paths", [])]

    if any(not entry.exists() for entry in expected_paths):
        download_file(manifest["archive_url"], archive_path)
        extract_zip_safely(archive_path, manifest.get("extract_to", "."))

    # Re-check after extraction: the archive must deliver everything it promised.
    missing = [str(entry) for entry in expected_paths if not entry.exists()]
    if missing:
        raise FileNotFoundError(f"Missing expected paths: {missing}")

    print(f"{manifest['name']} ready.")


# Use local books
Load corpus files from the local ./datasets/books folder.
import os
# Locate the local corpus folder and list the plain-text books it contains.
books_dir = "./datasets/books"

if not os.path.isdir(books_dir):
    raise FileNotFoundError(f"Books folder not found: {books_dir}")

# Keep regular files with a .txt suffix only, in alphabetical order.
with os.scandir(books_dir) as entries:
    available_book_files = sorted(
        entry.name
        for entry in entries
        if entry.name.endswith(".txt") and entry.is_file()
    )

if not available_book_files:
    raise FileNotFoundError(f"No .txt books found in {books_dir}")

print("Books found in ./datasets/books:")
for file_name in available_book_files:
    print(f" - {file_name}")


# Imports
# Commented out IPython magic to ensure Python compatibility.
# Run this once for each fresh remote kernel.
# %pip install -q -U transformers peft accelerate sentencepiece matplotlib
# standard library
import json
import math
import os
import random
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# plotting
import matplotlib.pyplot as plt
# torch
import torch
from torch.utils.data import DataLoader, Dataset
# hugging face / peft
from huggingface_hub.utils import disable_progress_bars
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
disable_progress_bars()

# Set hyperparameters and options
# Set your hyperparameters here (they are used later in the code) so that you can run and compare different experiments by changing these values.
# Note: a better alternative would be to set hyperparameters and other options through command-line arguments (see Python's argparse module).
# LoRA fine-tuning setup sized for a TITAN Xp (12 GB)
# Seed both transformers' set_seed() and the standard-library random module.
seed = 42
set_seed(seed)
random.seed(seed)
# choose the GPU explicitly; set to 1 for this remote kernel
# (the device setup below raises ValueError when this index is not visible)
cuda_device_id = 1
# base LLM: small enough for fast LoRA experiments on 12 GB VRAM
base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# corpus options
# corpus_book_name must be one of the .txt files found in books_dir.
corpus_book_name = "gameofthrones.txt"
# Share of paragraphs held out for evaluation.
validation_fraction = 0.10
validation_split_mode = "contiguous_paragraphs" # contiguous_paragraphs or shuffled_paragraphs
# Seed of the private RNG used only by the "shuffled_paragraphs" split.
split_seed = 42
# Paragraphs shorter than this (after whitespace normalization) are dropped.
min_paragraph_chars = 120
# Sliding-window size and step, in characters, used to cut training samples.
chars_per_sample = 900
chars_stride = 450
# Upper bounds applied by slicing the generated example lists.
max_train_examples = 320
max_eval_examples = 64
# tokenization / optimization
# Every example is truncated or padded to exactly this many tokens.
max_seq_length = 256
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
# Number of micro-batches accumulated before each optimizer step.
gradient_accumulation_steps = 8
# Training stops after this many optimizer steps.
max_optimization_steps = 100
learning_rate = 2e-4
weight_decay = 0.01
# The learning rate ramps up linearly over the first warmup_steps optimizer steps.
warmup_steps = 6
# Print training metrics / run validation every N optimizer steps.
logging_steps = 5
eval_every_steps = 10
use_gradient_checkpointing = True
# LoRA adapter hyperparameters
# (forwarded to peft.LoraConfig as r, lora_alpha, lora_dropout, target_modules)
lora_rank = 32
lora_alpha = 64
lora_dropout = 0.05
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
# generation defaults
# generation_seed is the text prompt fed to the model, not an RNG seed.
generation_seed = (
"Continue the following passage in the same narrative style.\n\n"
"Joffrey said"
)
generation_max_new_tokens = 120
generation_temperature = 0.80
generation_top_p = 0.92
generation_top_k = 40
generation_repetition_penalty = 1.05
# output location kept separate from transformer_results
adapter_root_dir = os.path.abspath("LLM_finetuning")
# Run name embeds the corpus file name without its extension.
adapter_run_name = f"qwen25_0p5b_lora_{os.path.splitext(corpus_book_name)[0]}"
adapter_output_dir = os.path.join(adapter_root_dir, adapter_run_name)
# When True and an adapter is already saved, it is reloaded instead of retrained.
skip_training_if_adapter_exists = False
# Resolve the corpus path and fail early when the selected book is unavailable.
corpus_book_file = os.path.join(books_dir, corpus_book_name)
if corpus_book_name not in available_book_files:
    raise FileNotFoundError(
        f"Selected corpus file not found in ./datasets/books: {corpus_book_name}. "
        f"Available files: {available_book_files}"
    )

# Pick the compute device: the requested GPU when CUDA is usable, else the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    gpu_count = torch.cuda.device_count()
    if cuda_device_id < 0 or cuda_device_id >= gpu_count:
        raise ValueError(
            f"Requested cuda_device_id={cuda_device_id}, but only {gpu_count} GPU(s) are visible in this kernel."
        )
    device = torch.device(f"cuda:{cuda_device_id}")
    torch.cuda.set_device(device)

# Half precision on the GPU, full precision on the CPU.
if device.type == "cuda":
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

os.makedirs(adapter_root_dir, exist_ok=True)

# Echo the effective configuration.
print(f"Selected corpus: {corpus_book_name}")
print(f"Validation split: {validation_fraction:.0%} ({validation_split_mode})")
print(f"Selected device: {device}")
if device.type == "cuda":
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    print(f"Selected GPU name: {torch.cuda.get_device_name(cuda_device_id)}")
    print(f"Free / total GPU memory: {free_bytes / 2**30:.2f} / {total_bytes / 2**30:.2f} GB")
print(f"Base model: {base_model_name}")
print(f"LoRA rank / alpha: {lora_rank} / {lora_alpha}")
print(f"Max sequence length: {max_seq_length}")
print(f"LoRA outputs will be saved in: {adapter_output_dir}")
print(f"Skip training when adapter exists: {skip_training_if_adapter_exists}")


# Prepare the LLM and LoRA helpers
Use a compact Qwen model plus LoRA adapters so the experiment fits on a TITAN Xp with 12 GB of VRAM.
def read_book_paragraphs(file_path, min_chars=120):
    """Read a UTF-8 text file and return its sufficiently long paragraphs.

    Paragraphs are the blocks separated by a blank line. Inside each block all
    whitespace runs are collapsed to single spaces; blocks shorter than
    ``min_chars`` after that normalization are dropped.

    Raises:
        ValueError: If fewer than 8 paragraphs survive the length filter.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    normalized_blocks = (" ".join(chunk.split()) for chunk in raw_text.split("\n\n"))
    paragraphs = [text for text in normalized_blocks if len(text) >= min_chars]

    if len(paragraphs) >= 8:
        return paragraphs
    raise ValueError(
        f"The selected book produced only {len(paragraphs)} usable paragraphs. Choose a larger corpus or reduce min_paragraph_chars."
    )
def build_lora_examples(paragraphs, sample_chars=900, stride_chars=450):
    """Cut the corpus into overlapping windows and format them as training prompts.

    The paragraphs are joined with blank lines and sliced into windows of
    ``sample_chars`` characters taken every ``stride_chars`` characters; a final
    window aligned to the end of the text is added when the regular grid does
    not already end there. Each whitespace-normalized window is split at about
    60% of its length (backing up to the previous space) into a passage and a
    continuation.

    Windows shorter than 220 characters, or whose passage / continuation is
    shorter than 100 / 40 characters, are skipped.

    Raises:
        ValueError: If no window yields a usable example.
    """
    corpus = "\n\n".join(paragraphs)

    if len(corpus) > sample_chars:
        last_start = len(corpus) - sample_chars
        windows = [
            corpus[offset:offset + sample_chars]
            for offset in range(0, last_start + 1, stride_chars)
        ]
        tail = corpus[-sample_chars:]
        if windows[-1] != tail:
            windows.append(tail)
    else:
        windows = [corpus]

    examples = []
    for window in windows:
        text = " ".join(window.split())
        if len(text) < 220:
            continue

        cut = int(len(text) * 0.60)
        # Drop the (possibly partial) last word so the passage ends on a word boundary.
        prefix = text[:cut].rsplit(" ", 1)[0].strip()
        suffix = text[len(prefix):].strip()

        if len(prefix) >= 100 and len(suffix) >= 40:
            examples.append(
                "Continue the following passage in the same narrative style.\n\n"
                f"### Passage\n{prefix}\n\n"
                f"### Continuation\n{suffix}"
            )

    if examples:
        return examples
    raise ValueError("No training examples could be built from the selected corpus.")
class SupervisedTextDataset(Dataset):
    """Pre-tokenized text examples for causal language-model fine-tuning.

    All examples are tokenized once in the constructor, truncated or padded to
    ``max_length``. Labels are a copy of the input ids in which every position
    with attention mask 0 is replaced by -100.
    """

    def __init__(self, text_examples, tokenizer, max_length):
        batch = tokenizer(
            text_examples,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt"
        )
        self.input_ids = batch["input_ids"]
        self.attention_mask = batch["attention_mask"]
        # masked_fill returns a new tensor, so input_ids itself is left untouched.
        self.labels = self.input_ids.masked_fill(self.attention_mask == 0, -100)

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, index):
        item = {}
        item["input_ids"] = self.input_ids[index]
        item["attention_mask"] = self.attention_mask[index]
        item["labels"] = self.labels[index]
        return item
def load_tokenizer(model_name):
    """Load the fast tokenizer for ``model_name``, configured for right padding.

    When the tokenizer defines no pad token, the end-of-sequence token is used
    as the pad token.
    """
    loaded = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    has_pad_token = loaded.pad_token is not None
    if not has_pad_token:
        loaded.pad_token = loaded.eos_token

    loaded.padding_side = "right"
    return loaded
def build_lora_model(model_name):
    """Load the base causal LM, wrap it with LoRA adapters, and return it in train mode.

    Reads the module-level settings ``model_dtype``, ``device``,
    ``use_gradient_checkpointing`` and the ``lora_*`` hyperparameters.
    """
    backbone = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype)
    backbone.config.use_cache = False

    if use_gradient_checkpointing:
        backbone.gradient_checkpointing_enable()
        # Not every model class exposes this hook, hence the lookup.
        enable_input_grads = getattr(backbone, "enable_input_require_grads", None)
        if enable_input_grads is not None:
            enable_input_grads()

    adapter_settings = {
        "task_type": TaskType.CAUSAL_LM,
        "r": lora_rank,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "target_modules": lora_target_modules,
        "bias": "none",
    }
    model = get_peft_model(backbone, LoraConfig(**adapter_settings))
    model.to(device)
    model.train()

    report_trainable = getattr(model, "print_trainable_parameters", None)
    if report_trainable is not None:
        report_trainable()
    return model
def load_lora_model_for_inference(model_name, adapter_dir):
    """Load the base model plus the LoRA adapter saved in ``adapter_dir``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is moved to the module-level
        ``device`` and switched to evaluation mode.
    """
    tokenizer = load_tokenizer(model_name)
    backbone = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype)
    adapted_model = PeftModel.from_pretrained(backbone, adapter_dir)
    adapted_model.to(device)
    adapted_model.eval()
    return tokenizer, adapted_model


# Create the LoRA fine-tuning dataset
Turn the selected book into prompt/completion examples for supervised causal fine-tuning.
# Read the corpus and split its paragraphs into training and validation parts.
paragraphs = read_book_paragraphs(corpus_book_file, min_chars=min_paragraph_chars)

if validation_split_mode == "contiguous_paragraphs":
    split_source = paragraphs
elif validation_split_mode == "shuffled_paragraphs":
    shuffled_paragraphs = paragraphs.copy()
    random.Random(split_seed).shuffle(shuffled_paragraphs)
    split_source = shuffled_paragraphs
else:
    raise ValueError("validation_split_mode must be either 'contiguous_paragraphs' or 'shuffled_paragraphs'.")

# Clamp the split point so both parts keep at least one paragraph.
split_idx = int(round(len(split_source) * (1 - validation_fraction)))
split_idx = min(max(1, split_idx), len(split_source) - 1)
train_paragraphs = split_source[:split_idx]
eval_paragraphs = split_source[split_idx:]

# Build the prompt/continuation examples and cap their number.
window_options = {"sample_chars": chars_per_sample, "stride_chars": chars_stride}
train_examples = build_lora_examples(train_paragraphs, **window_options)[:max_train_examples]
eval_examples = build_lora_examples(eval_paragraphs, **window_options)[:max_eval_examples]

tokenizer = load_tokenizer(base_model_name)
train_dataset = SupervisedTextDataset(train_examples, tokenizer, max_seq_length)
eval_dataset = SupervisedTextDataset(eval_examples, tokenizer, max_seq_length)

# Number of non-padding tokens per example.
train_token_lengths = train_dataset.attention_mask.sum(dim=1).float()
eval_token_lengths = eval_dataset.attention_mask.sum(dim=1).float()

print(f"Paragraphs found: {len(paragraphs)}")
print(f"Train paragraphs: {len(train_paragraphs)} | Eval paragraphs: {len(eval_paragraphs)}")
print(f"Train examples: {len(train_examples)} | Eval examples: {len(eval_examples)}")
print(f"Average train tokens: {train_token_lengths.mean().item():.1f}")
print(f"Average eval tokens: {eval_token_lengths.mean().item():.1f}")
print(f"Tokenizer pad token: {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")
print("\nSample fine-tuning example:\n")
print(train_examples[0][:800])


# Training and generation utilities
Define evaluation, plotting, checkpoint-summary, and text-generation helpers for the LoRA exercise.
def loss_to_perplexity(loss_value):
    """Convert a cross-entropy loss to perplexity, ``exp(loss)``.

    NaN is passed through as NaN, and any loss above 20 is reported as
    infinity instead of being exponentiated.
    """
    is_nan = loss_value != loss_value  # NaN is the only value unequal to itself
    if is_nan:
        return float("nan")
    return float("inf") if loss_value > 20 else float(math.exp(loss_value))
def evaluate_loss(model, dataloader):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    Returns NaN for an empty dataloader. Otherwise the model is put in eval
    mode for the pass and switched back to train mode before returning.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        return float("nan")

    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in dataloader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            batch_losses.append(model(**on_device).loss.item())
    model.train()

    return sum(batch_losses) / num_batches
def generate_with_model(
    model,
    tokenizer,
    prompt,
    max_new_tokens=120,
    temperature=0.8,
    top_p=0.92,
    top_k=40,
    repetition_penalty=1.05
):
    """Sample a continuation of ``prompt`` and return the decoded text.

    The model is switched to eval mode, and the prompt tensors are moved to the
    module-level ``device``. The decoded string covers the whole generated
    sequence (index 0 of the output) with special tokens removed.
    """
    model.eval()

    model_inputs = {
        name: tensor.to(device)
        for name, tensor in tokenizer(prompt, return_tensors="pt").items()
    }
    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        output_ids = model.generate(**model_inputs, **sampling_options)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
def plot_training_history(history):
    """Plot loss and perplexity curves side by side from a training history dict.

    Prints a notice and returns without plotting when ``history`` is empty or
    holds no optimizer steps. Validation curves are drawn only when
    ``history["eval_steps"]`` is non-empty.
    """
    if not history or not history.get("optimizer_steps"):
        print("No new training history to plot.")
        return

    _, axes = plt.subplots(1, 2, figsize=(13, 4))

    # (train key, eval key, train label, eval label, title, y-axis label) per panel.
    panels = (
        ("train_loss", "eval_loss", "Train loss", "Validation loss",
         "LoRA fine-tuning: loss", "Loss"),
        ("train_perplexity", "eval_perplexity", "Train perplexity", "Validation perplexity",
         "LoRA fine-tuning: perplexity", "Perplexity"),
    )
    for axis, (train_key, eval_key, train_label, eval_label, title, y_label) in zip(axes, panels):
        axis.plot(history["optimizer_steps"], history[train_key], marker="o", linewidth=2, label=train_label)
        if history.get("eval_steps"):
            axis.plot(history["eval_steps"], history[eval_key], marker="o", linewidth=2, label=eval_label)
        axis.set_title(title)
        axis.set_xlabel("Optimizer step")
        axis.set_ylabel(y_label)
        axis.grid(alpha=0.3)
        axis.legend()

    plt.tight_layout()
    plt.show()
def load_training_summary(summary_path):
    """Return the parsed JSON summary at ``summary_path``, or ``{}`` if it does not exist."""
    if os.path.exists(summary_path):
        with open(summary_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return {}


# Fine-tune Qwen with LoRA
First run the next cell to define the LoRA training loop, then run the following cell to either train a new adapter or reload one from LLM_finetuning.
def _save_adapter_checkpoint(
    model,
    tokenizer,
    output_dir,
    summary_path,
    best_eval_loss,
    best_eval_perplexity,
    optimizer_step,
):
    """Write the adapter, the tokenizer, and the JSON run summary to ``output_dir``."""
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    with open(summary_path, "w", encoding="utf-8") as handle:
        json.dump(
            {
                "base_model_name": base_model_name,
                "best_eval_loss": best_eval_loss,
                "best_eval_perplexity": best_eval_perplexity,
                "optimizer_steps": optimizer_step,
                "max_seq_length": max_seq_length,
                "cuda_device_id": cuda_device_id,
                "corpus_book_name": corpus_book_name,
                "adapter_output_dir": output_dir
            },
            handle,
            indent=2
        )


def fine_tune_with_lora(model, tokenizer, train_dataset, eval_dataset, output_dir):
    """Train the LoRA adapter and return the best checkpoint reloaded for inference.

    Runs ``max_optimization_steps`` optimizer steps with gradient accumulation,
    linear learning-rate warmup, gradient clipping at norm 1.0, and (on CUDA)
    float16 autocast with loss scaling. Validation runs every
    ``eval_every_steps`` optimizer steps and on the last step; each time the
    validation loss improves, the adapter, tokenizer, and a
    ``training_summary.json`` file are written to ``output_dir``. If no
    evaluation ever improved on the initial best, the final model is evaluated
    and saved once after the loop.

    All hyperparameters are read from module-level settings.

    Args:
        model: PEFT-wrapped model to train.
        tokenizer: Tokenizer saved next to the adapter.
        train_dataset: Dataset yielding dicts of tensors accepted by ``model``.
        eval_dataset: Dataset used for the validation loss.
        output_dir: Directory receiving the adapter files and the summary.

    Returns:
        dict with keys ``model`` and ``tokenizer`` (reloaded from
        ``output_dir``), ``history``, ``best_eval_loss``,
        ``best_eval_perplexity``, ``output_dir``, and ``loaded_from_adapter``
        (always ``False`` here).

    Raises:
        RuntimeError: If the training dataloader is empty.
    """
    os.makedirs(output_dir, exist_ok=True)

    train_loader = DataLoader(
        train_dataset,
        batch_size=per_device_train_batch_size,
        shuffle=True,
        drop_last=False
    )
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=per_device_eval_batch_size,
        shuffle=False,
        drop_last=False
    )

    optimizer = torch.optim.AdamW(
        [parameter for parameter in model.parameters() if parameter.requires_grad],
        lr=learning_rate,
        weight_decay=weight_decay
    )

    # Newer PyTorch exposes GradScaler under torch.amp; fall back to the
    # torch.cuda.amp location when that attribute is missing.
    grad_scaler_ctor = getattr(torch.amp, "GradScaler", None)
    if grad_scaler_ctor is not None:
        scaler = grad_scaler_ctor("cuda", enabled=device.type == "cuda")
    else:
        scaler = getattr(torch.cuda.amp, "GradScaler")(enabled=device.type == "cuda")

    summary_path = os.path.join(output_dir, "training_summary.json")
    history = {
        "optimizer_steps": [],
        "train_loss": [],
        "train_perplexity": [],
        "eval_steps": [],
        "eval_loss": [],
        "eval_perplexity": []
    }
    best_eval_loss = float("inf")
    best_eval_perplexity = float("inf")
    optimizer_step = 0
    micro_step = 0
    accumulated_loss = 0.0

    # Outer loop restarts the dataloader until enough optimizer steps are done.
    while optimizer_step < max_optimization_steps:
        for batch in train_loader:
            micro_step += 1
            batch = {key: value.to(device) for key, value in batch.items()}

            # Linear warmup, then a constant learning rate.
            if warmup_steps > 0 and optimizer_step < warmup_steps:
                current_lr = learning_rate * (optimizer_step + 1) / warmup_steps
            else:
                current_lr = learning_rate
            for param_group in optimizer.param_groups:
                param_group["lr"] = current_lr

            with torch.amp.autocast(device_type=device.type, enabled=device.type == "cuda", dtype=torch.float16):
                outputs = model(**batch)
                # Scale down so the accumulated gradient is the micro-batch mean.
                loss = outputs.loss / gradient_accumulation_steps
            scaler.scale(loss).backward()
            accumulated_loss += outputs.loss.item()

            # Keep accumulating until a full group of micro-batches is in.
            if micro_step % gradient_accumulation_steps != 0:
                continue

            # Unscale first so clipping sees the true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            optimizer_step += 1

            step_train_loss = accumulated_loss / gradient_accumulation_steps
            step_train_perplexity = loss_to_perplexity(step_train_loss)
            accumulated_loss = 0.0
            history["optimizer_steps"].append(optimizer_step)
            history["train_loss"].append(step_train_loss)
            history["train_perplexity"].append(step_train_perplexity)

            should_log = optimizer_step == 1 or optimizer_step % logging_steps == 0
            should_eval = optimizer_step % eval_every_steps == 0 or optimizer_step == max_optimization_steps

            if should_eval:
                eval_loss = evaluate_loss(model, eval_loader)
                eval_perplexity = loss_to_perplexity(eval_loss)
                history["eval_steps"].append(optimizer_step)
                history["eval_loss"].append(eval_loss)
                history["eval_perplexity"].append(eval_perplexity)
                print(
                    f"[LoRA] step {optimizer_step:03d}/{max_optimization_steps} | "
                    f"train_loss: {step_train_loss:.4f} | train_ppl: {step_train_perplexity:.2f} | "
                    f"val_loss: {eval_loss:.4f} | val_ppl: {eval_perplexity:.2f} | "
                    f"lr: {current_lr:.6f}"
                )
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    best_eval_perplexity = eval_perplexity
                    _save_adapter_checkpoint(
                        model,
                        tokenizer,
                        output_dir,
                        summary_path,
                        best_eval_loss,
                        best_eval_perplexity,
                        optimizer_step,
                    )
                    print(
                        f"[LoRA] Saved improved adapter to: {output_dir} "
                        f"(best_val_loss={best_eval_loss:.4f}, best_val_ppl={best_eval_perplexity:.2f})"
                    )
            elif should_log:
                print(
                    f"[LoRA] step {optimizer_step:03d}/{max_optimization_steps} | "
                    f"train_loss: {step_train_loss:.4f} | train_ppl: {step_train_perplexity:.2f} | "
                    f"lr: {current_lr:.6f}"
                )

            if optimizer_step >= max_optimization_steps:
                break

        # An empty loader would make the outer loop spin forever.
        if len(train_loader) == 0:
            raise RuntimeError("Training dataloader is empty.")

    # Nothing was saved during training (no evaluation improved on the
    # initial best): evaluate and save the final weights instead.
    if best_eval_loss == float("inf"):
        best_eval_loss = evaluate_loss(model, eval_loader)
        best_eval_perplexity = loss_to_perplexity(best_eval_loss)
        _save_adapter_checkpoint(
            model,
            tokenizer,
            output_dir,
            summary_path,
            best_eval_loss,
            best_eval_perplexity,
            optimizer_step,
        )

    # Hand back the checkpoint on disk (the best one), not the last training state.
    tokenizer_for_inference, best_model = load_lora_model_for_inference(base_model_name, output_dir)
    return {
        "model": best_model,
        "tokenizer": tokenizer_for_inference,
        "history": history,
        "best_eval_loss": best_eval_loss,
        "best_eval_perplexity": best_eval_perplexity,
        "output_dir": output_dir,
        "loaded_from_adapter": False
    }
# Train a new adapter, or reload a saved one when that is allowed and possible.
adapter_config_path = os.path.join(adapter_output_dir, "adapter_config.json")
training_summary_path = os.path.join(adapter_output_dir, "training_summary.json")
adapter_exists = os.path.exists(adapter_config_path)

print(f"Current working directory: {os.getcwd()}")
print(f"Adapter output directory: {adapter_output_dir}")
print(f"Adapter already exists: {adapter_exists}")
print(f"Skip training if adapter exists: {skip_training_if_adapter_exists}")

reuse_saved_adapter = skip_training_if_adapter_exists and adapter_exists
if not reuse_saved_adapter:
    lora_model = build_lora_model(base_model_name)
    lora_results = fine_tune_with_lora(
        lora_model,
        tokenizer,
        train_dataset,
        eval_dataset,
        adapter_output_dir
    )
else:
    tokenizer, fine_tuned_model = load_lora_model_for_inference(base_model_name, adapter_output_dir)
    summary = load_training_summary(training_summary_path)
    best_eval_loss = summary.get("best_eval_loss", float("nan"))
    best_eval_perplexity = summary.get("best_eval_perplexity", loss_to_perplexity(best_eval_loss))
    # No training happened in this session, so every history series is empty.
    history_keys = (
        "optimizer_steps",
        "train_loss",
        "train_perplexity",
        "eval_steps",
        "eval_loss",
        "eval_perplexity",
    )
    lora_results = {
        "model": fine_tuned_model,
        "tokenizer": tokenizer,
        "history": {key: [] for key in history_keys},
        "best_eval_loss": best_eval_loss,
        "best_eval_perplexity": best_eval_perplexity,
        "output_dir": adapter_output_dir,
        "loaded_from_adapter": True
    }
    print(f"Loaded saved LoRA adapter instead of retraining: {adapter_output_dir}")

tokenizer = lora_results["tokenizer"]
fine_tuned_model = lora_results["model"]

plot_training_history(lora_results["history"])
print(f"Best eval loss: {lora_results['best_eval_loss']:.4f}")
print(f"Best eval perplexity: {lora_results['best_eval_perplexity']:.2f}")
print(f"Adapter directory: {lora_results['output_dir']}")
print(f"Loaded from existing adapter: {lora_results['loaded_from_adapter']}")

# A quick sample is only worth printing right after a fresh training run.
if not lora_results["loaded_from_adapter"]:
    preview_text = generate_with_model(
        fine_tuned_model,
        tokenizer,
        generation_seed,
        max_new_tokens=80,
        temperature=generation_temperature,
        top_p=generation_top_p,
        top_k=generation_top_k,
        repetition_penalty=generation_repetition_penalty
    )
    print("\n--- QUICK PREVIEW ---\n")
    print(preview_text)
    print("\n--- END PREVIEW ---")


# Generate text with the LoRA adapter
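# Run this cell after training, or at any later time, to generate text with the fine-tuned model on the configured device (GPU 1 by default). If the model is no longer in memory, the saved adapter is reloaded from LLM_finetuning.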
# Reuse the in-memory model when there is one; otherwise reload the saved adapter.
adapter_config_path = os.path.join(adapter_output_dir, "adapter_config.json")

if globals().get("fine_tuned_model") is None:
    if not os.path.exists(adapter_config_path):
        raise RuntimeError(
            f"No saved adapter found at {adapter_output_dir!r}. Run the LoRA fine-tuning cell first."
        )
    tokenizer, fine_tuned_model = load_lora_model_for_inference(base_model_name, adapter_output_dir)

sample_prompt = generation_seed
generation_options = {
    "max_new_tokens": generation_max_new_tokens,
    "temperature": generation_temperature,
    "top_p": generation_top_p,
    "top_k": generation_top_k,
    "repetition_penalty": generation_repetition_penalty,
}
sample_text = generate_with_model(fine_tuned_model, tokenizer, sample_prompt, **generation_options)

print(f"Loaded adapter directory: {adapter_output_dir}")
print(f"Selected device: {device}")
print(f"Prompt:\n{sample_prompt}")
print("\n--- GENERATED TEXT START ---\n")
print(sample_text)
print("\n--- GENERATED TEXT END ---")