LLM fine-tuning with LoRA

from pathlib import Path
from urllib.request import Request, urlopen
import json
import shutil
import zipfile
 
# URLs of the JSON manifests describing the resources this notebook needs.
# The download loop below reads these keys from each manifest:
# "expected_paths", "archive_url", "extract_to" and "name".
RESOURCE_MANIFESTS = [
    "https://assets.deeplearningnotes.com/code-support-resources/datasets/books/latest.json",
]
 
def download_file(url, path):
    """Stream the resource at *url* into the local file *path*.

    The request carries a browser-like ``User-Agent`` header. The body is
    copied in chunks, so it is never held in memory as a whole. An existing
    file at *path* is overwritten.
    """
    browser_headers = {"User-Agent": "Mozilla/5.0"}
    with urlopen(Request(url, headers=browser_headers)) as remote:
        with open(path, "wb") as local:
            shutil.copyfileobj(remote, local)
 
def extract_zip_safely(zip_path, target_dir="."):
    """Extract *zip_path* into *target_dir*, rejecting members that escape it.

    Every member name is resolved against the target directory before
    anything is written; extraction only starts once all members have been
    verified.

    Args:
        zip_path: Path of the zip archive to extract.
        target_dir: Directory to extract into (defaults to the current one).

    Raises:
        RuntimeError: If a member would resolve to a location outside
            *target_dir*.
    """
    target_dir = Path(target_dir).resolve()

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        for member in zip_ref.infolist():
            target_path = (target_dir / member.filename).resolve()
            # Compare path components, not string prefixes: a plain
            # startswith() check would accept a sibling such as
            # "<target>_evil/x" because it shares the textual prefix.
            inside_target = (
                target_path == target_dir or target_dir in target_path.parents
            )
            if not inside_target:
                raise RuntimeError(f"Unsafe zip path: {member.filename}")

        zip_ref.extractall(target_dir)
 
# For every manifest: fetch it, download + unpack the archive only when some
# expected path is missing, then verify that everything is in place.
for manifest_url in RESOURCE_MANIFESTS:
    # The resource name is the second-to-last URL segment (".../<name>/latest.json").
    name = manifest_url.rstrip("/").rsplit("/", 2)[-2]
    manifest_path = Path(name + "-latest.json")
    archive_path = Path(name + ".zip")

    download_file(manifest_url, manifest_path)
    with manifest_path.open("r", encoding="utf-8") as manifest_handle:
        manifest = json.load(manifest_handle)

    expected_paths = list(map(Path, manifest.get("expected_paths", [])))

    something_is_missing = any(not expected.exists() for expected in expected_paths)
    if something_is_missing:
        download_file(manifest["archive_url"], archive_path)
        extract_zip_safely(archive_path, manifest.get("extract_to", "."))

    # Re-check after the (possible) extraction and fail loudly if needed.
    missing = [str(expected) for expected in expected_paths if not expected.exists()]
    if missing:
        raise FileNotFoundError(f"Missing expected paths: {missing}")

    print(f"{manifest['name']} ready.")

Use local books

Load corpus files from the local ./datasets/books folder.

import os
 
# Locate the local corpus folder and list the plain-text books it contains.
books_dir = "./datasets/books"
if not os.path.isdir(books_dir):
    raise FileNotFoundError(f"Books folder not found: {books_dir}")

# Keep regular files with a .txt extension only, in alphabetical order.
available_book_files = sorted(
    entry
    for entry in os.listdir(books_dir)
    if entry.endswith(".txt") and os.path.isfile(os.path.join(books_dir, entry))
)

if not available_book_files:
    raise FileNotFoundError(f"No .txt books found in {books_dir}")

print("Books found in ./datasets/books:")
for file_name in available_book_files:
    print(" - " + file_name)

Imports

# Commented out IPython magic to ensure Python compatibility.
# Run this once for each fresh remote kernel.
# %pip install -q -U transformers peft accelerate sentencepiece matplotlib
 
# standard library
import json
import math
import os
import random
 
# Set before the Hugging Face imports further down in this cell.
# NOTE(review): presumably these silence hub download progress bars and the
# tokenizers parallelism warning — confirm against the library docs.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
# plotting
import matplotlib.pyplot as plt
 
# torch
import torch
from torch.utils.data import DataLoader, Dataset
 
# hugging face / peft
from huggingface_hub.utils import disable_progress_bars
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 
# Turn off huggingface_hub progress bars via its own API (imported above),
# in addition to the HF_HUB_DISABLE_PROGRESS_BARS environment variable.
disable_progress_bars()

Set hyperparameters and options

Set your hyperparameters here (they are used later in the code), so that you can run and compare different experiments by changing these values.

Note

A better alternative would be to set hyperparameters and other options through command-line arguments (see Python's argparse module).

# LoRA fine-tuning setup sized for a TITAN Xp (12 GB)
# The same seed is passed to transformers' set_seed and to Python's random.
seed = 42
set_seed(seed)
random.seed(seed)

# choose the GPU explicitly; set to 1 for this remote kernel
# (the device-selection code below raises ValueError if this index is not visible)
cuda_device_id = 1

# base LLM: small enough for fast LoRA experiments on 12 GB VRAM
base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# corpus options
corpus_book_name = "gameofthrones.txt"  # must be one of available_book_files
validation_fraction = 0.10
validation_split_mode = "contiguous_paragraphs"  # contiguous_paragraphs or shuffled_paragraphs
split_seed = 42  # only used when validation_split_mode == "shuffled_paragraphs"
min_paragraph_chars = 120  # shorter paragraphs are dropped by read_book_paragraphs
# Sliding character windows used by build_lora_examples; the stride is half the
# window size, so consecutive windows overlap by 50%.
chars_per_sample = 900
chars_stride = 450
# Only the first N built examples are kept for each split.
max_train_examples = 320
max_eval_examples = 64

# tokenization / optimization
max_seq_length = 256  # examples are truncated / padded to this many tokens
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
# The optimizer steps once every `gradient_accumulation_steps` micro-batches,
# i.e. 1 x 8 = 8 examples per optimizer step with the values below.
gradient_accumulation_steps = 8
max_optimization_steps = 100
learning_rate = 2e-4
weight_decay = 0.01
warmup_steps = 6  # linear learning-rate warmup, then constant learning_rate
logging_steps = 5
eval_every_steps = 10
use_gradient_checkpointing = True

# LoRA adapter hyperparameters
lora_rank = 32
lora_alpha = 64
lora_dropout = 0.05
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

# generation defaults
# The prompt starts with the same instruction sentence used in the training examples.
generation_seed = (
    "Continue the following passage in the same narrative style.\n\n"
    "Joffrey said"
)
generation_max_new_tokens = 120
generation_temperature = 0.80
generation_top_p = 0.92
generation_top_k = 40
generation_repetition_penalty = 1.05

# output location kept separate from transformer_results
adapter_root_dir = os.path.abspath("LLM_finetuning")
adapter_run_name = f"qwen25_0p5b_lora_{os.path.splitext(corpus_book_name)[0]}"
adapter_output_dir = os.path.join(adapter_root_dir, adapter_run_name)
# When False, the training cell always retrains, even if a saved adapter exists.
skip_training_if_adapter_exists = False
 
# Resolve the selected corpus and make sure it is one of the listed books.
corpus_book_file = os.path.join(books_dir, corpus_book_name)
corpus_is_available = corpus_book_name in available_book_files
if not corpus_is_available:
    raise FileNotFoundError(
        f"Selected corpus file not found in ./datasets/books: {corpus_book_name}. "
        f"Available files: {available_book_files}"
    )

# Pick the requested GPU when CUDA is usable, otherwise fall back to the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    gpu_count = torch.cuda.device_count()
    if cuda_device_id < 0 or cuda_device_id >= gpu_count:
        raise ValueError(
            f"Requested cuda_device_id={cuda_device_id}, but only {gpu_count} GPU(s) are visible in this kernel."
        )
    device = torch.device(f"cuda:{cuda_device_id}")
    torch.cuda.set_device(device)

# Half precision on GPU, full precision on CPU.
if device.type == "cuda":
    model_dtype = torch.float16
else:
    model_dtype = torch.float32
os.makedirs(adapter_root_dir, exist_ok=True)

# Summary of the chosen configuration.
print(f"Selected corpus: {corpus_book_name}")
print(f"Validation split: {validation_fraction:.0%} ({validation_split_mode})")
print(f"Selected device: {device}")
if device.type == "cuda":
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    print(f"Selected GPU name: {torch.cuda.get_device_name(cuda_device_id)}")
    print(f"Free / total GPU memory: {free_bytes / 2**30:.2f} / {total_bytes / 2**30:.2f} GB")
print(f"Base model: {base_model_name}")
print(f"LoRA rank / alpha: {lora_rank} / {lora_alpha}")
print(f"Max sequence length: {max_seq_length}")
print(f"LoRA outputs will be saved in: {adapter_output_dir}")
print(f"Skip training when adapter exists: {skip_training_if_adapter_exists}")

Prepare the LLM and LoRA helpers

Use a compact Qwen model plus LoRA adapters so the experiment fits on a TITAN Xp with 12 GB of VRAM.

def read_book_paragraphs(file_path, min_chars=120):
    """Read a UTF-8 text file and return its whitespace-normalized paragraphs.

    Paragraphs are separated by blank lines. A "blank" line may contain
    spaces or tabs; it still counts as a separator. Inside each paragraph
    every run of whitespace is collapsed to a single space.

    Args:
        file_path: Path of the text file to read.
        min_chars: Minimum length (after normalization) a paragraph must
            have to be kept.

    Returns:
        The kept paragraphs, in file order.

    Raises:
        ValueError: If fewer than 8 usable paragraphs are found.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    # Split on one or more blank lines, tolerating whitespace-only lines;
    # a literal "\n\n" split would silently merge such paragraphs.
    blocks = re.split(r"\n\s*\n", raw_text)

    normalized_blocks = (" ".join(block.split()) for block in blocks)
    paragraphs = [text for text in normalized_blocks if len(text) >= min_chars]

    if len(paragraphs) < 8:
        raise ValueError(
            f"The selected book produced only {len(paragraphs)} usable paragraphs. Choose a larger corpus or reduce min_paragraph_chars."
        )

    return paragraphs
 
 
def build_lora_examples(paragraphs, sample_chars=900, stride_chars=450):
    """Turn paragraphs into "passage -> continuation" training strings.

    The paragraphs are joined with blank lines and cut into character
    windows of ``sample_chars`` taken every ``stride_chars``; a final window
    aligned to the end of the text is added when the last regular window
    differs from it. Each window is whitespace-normalized and split at a
    word boundary near 60% of its length into a passage and a continuation.
    Windows that are too short (fewer than 220 characters, passage under 100
    or continuation under 40) are skipped.

    Returns:
        The formatted example strings.

    Raises:
        ValueError: If no example could be built.
    """
    corpus = "\n\n".join(paragraphs)

    if len(corpus) > sample_chars:
        last_start = len(corpus) - sample_chars
        windows = [
            corpus[offset:offset + sample_chars]
            for offset in range(0, last_start + 1, stride_chars)
        ]
        tail_window = corpus[-sample_chars:]
        if windows[-1] != tail_window:
            windows.append(tail_window)
    else:
        windows = [corpus]

    instruction = "Continue the following passage in the same narrative style.\n\n"
    examples = []
    for window in windows:
        text = " ".join(window.split())
        if len(text) < 220:
            continue

        # Cut at the last space before the 60% mark so the passage ends on a
        # whole word; the continuation is everything after it.
        head = text[:int(len(text) * 0.60)]
        prefix = head.rsplit(" ", 1)[0].strip()
        suffix = text[len(prefix):].strip()
        if len(prefix) >= 100 and len(suffix) >= 40:
            examples.append(
                f"{instruction}### Passage\n{prefix}\n\n### Continuation\n{suffix}"
            )

    if not examples:
        raise ValueError("No training examples could be built from the selected corpus.")

    return examples
 
 
class SupervisedTextDataset(Dataset):
    """Pre-tokenized causal-LM dataset with padding masked out of the labels.

    All examples are tokenized once at construction time, truncated and
    padded to ``max_length``. Labels are a copy of the input ids in which
    every position with attention mask 0 is replaced by -100.
    """

    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, text_examples, tokenizer, max_length):
        batch = tokenizer(
            text_examples,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt"
        )
        token_ids = batch["input_ids"]
        mask = batch["attention_mask"]

        self.input_ids = token_ids
        self.attention_mask = mask
        # masked_fill returns a new tensor, so the input ids stay untouched.
        self.labels = token_ids.masked_fill(mask == 0, -100)

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, index):
        return {field: getattr(self, field)[index] for field in self._FIELDS}
 
 
def load_tokenizer(model_name):
    """Load the fast tokenizer for *model_name*, configured for right padding.

    When the tokenizer defines no pad token, the end-of-sequence token is
    used as the pad token.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    needs_pad_token = tok.pad_token is None
    if needs_pad_token:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"
    return tok
 
 
def build_lora_model(model_name):
    """Load the base causal LM and wrap it with trainable LoRA adapters.

    The base model is loaded in ``model_dtype`` with the generation cache
    disabled. When ``use_gradient_checkpointing`` is set, checkpointing is
    enabled on the base model before wrapping. The LoRA settings come from
    the notebook's hyperparameter cell.

    Returns:
        The PEFT-wrapped model, moved to ``device`` and set to train mode.
    """
    backbone = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype)
    backbone.config.use_cache = False

    if use_gradient_checkpointing:
        backbone.gradient_checkpointing_enable()
    if use_gradient_checkpointing and hasattr(backbone, "enable_input_require_grads"):
        backbone.enable_input_require_grads()

    adapter_settings = {
        "task_type": TaskType.CAUSAL_LM,
        "r": lora_rank,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "target_modules": lora_target_modules,
        "bias": "none",
    }
    peft_model = get_peft_model(backbone, LoraConfig(**adapter_settings))
    peft_model.to(device)
    peft_model.train()

    # Report how many parameters are trainable when the wrapper supports it.
    if hasattr(peft_model, "print_trainable_parameters"):
        peft_model.print_trainable_parameters()

    return peft_model
 
 
def load_lora_model_for_inference(model_name, adapter_dir):
    """Reload the base model plus a saved LoRA adapter, ready for inference.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is on ``device`` and in
        eval mode.
    """
    inference_tokenizer = load_tokenizer(model_name)
    backbone = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype)
    adapted_model = PeftModel.from_pretrained(backbone, adapter_dir)
    adapted_model.to(device)
    adapted_model.eval()
    return inference_tokenizer, adapted_model

Create the LoRA fine-tuning dataset

Turn the selected book into prompt/completion examples for supervised causal fine-tuning.

paragraphs = read_book_paragraphs(corpus_book_file, min_chars=min_paragraph_chars)

if validation_split_mode not in ("contiguous_paragraphs", "shuffled_paragraphs"):
    raise ValueError("validation_split_mode must be either 'contiguous_paragraphs' or 'shuffled_paragraphs'.")

# Choose the paragraph order the split is applied to.
if validation_split_mode == "shuffled_paragraphs":
    shuffled_paragraphs = paragraphs.copy()
    random.Random(split_seed).shuffle(shuffled_paragraphs)
    _split_source = shuffled_paragraphs
else:
    _split_source = paragraphs

# Clamp the split index so that both splits hold at least one paragraph.
split_idx = int(round(len(_split_source) * (1 - validation_fraction)))
split_idx = min(max(1, split_idx), len(_split_source) - 1)
train_paragraphs = _split_source[:split_idx]
eval_paragraphs = _split_source[split_idx:]

# Examples are built per split, so windows never span the train/eval boundary.
_window_options = {"sample_chars": chars_per_sample, "stride_chars": chars_stride}
train_examples = build_lora_examples(train_paragraphs, **_window_options)[:max_train_examples]
eval_examples = build_lora_examples(eval_paragraphs, **_window_options)[:max_eval_examples]

tokenizer = load_tokenizer(base_model_name)
train_dataset = SupervisedTextDataset(train_examples, tokenizer, max_seq_length)
eval_dataset = SupervisedTextDataset(eval_examples, tokenizer, max_seq_length)

# Number of non-padding tokens per example.
train_token_lengths = train_dataset.attention_mask.sum(dim=1).float()
eval_token_lengths = eval_dataset.attention_mask.sum(dim=1).float()

print(f"Paragraphs found: {len(paragraphs)}")
print(f"Train paragraphs: {len(train_paragraphs)} | Eval paragraphs: {len(eval_paragraphs)}")
print(f"Train examples: {len(train_examples)} | Eval examples: {len(eval_examples)}")
print(f"Average train tokens: {train_token_lengths.mean().item():.1f}")
print(f"Average eval tokens: {eval_token_lengths.mean().item():.1f}")
print(f"Tokenizer pad token: {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")
print("\nSample fine-tuning example:\n")
print(train_examples[0][:800])

Training and generation utilities

Define evaluation, plotting, checkpoint-summary, and text-generation helpers for the LoRA exercise.

def loss_to_perplexity(loss_value):
    """Convert a cross-entropy loss into a perplexity value.

    NaN maps to NaN. Losses above 20 are reported as infinity instead of
    being exponentiated. Everything else returns ``exp(loss_value)``.
    """
    if math.isnan(loss_value):
        return float("nan")
    return float("inf") if loss_value > 20 else float(math.exp(loss_value))
 
 
def evaluate_loss(model, dataloader):
    """Return the mean per-batch loss of *model* over *dataloader*.

    The model is switched to eval mode and run without gradient tracking.
    Afterwards it is put back into whichever mode (train or eval) it was in
    before the call, even if a batch raises.

    Args:
        model: A module whose forward pass returns an object with ``.loss``.
        dataloader: Iterable of dict batches of tensors; each batch is moved
            to the notebook-level ``device``.

    Returns:
        The average of the per-batch losses, or NaN for an empty dataloader.
    """
    if len(dataloader) == 0:
        return float("nan")

    # Remember the caller's mode: unconditionally calling train() afterwards
    # would flip a model that was deliberately left in eval mode.
    was_training = model.training
    model.eval()
    total_loss = 0.0
    try:
        with torch.no_grad():
            for batch in dataloader:
                batch = {key: value.to(device) for key, value in batch.items()}
                outputs = model(**batch)
                total_loss += outputs.loss.item()
    finally:
        model.train(was_training)

    return total_loss / len(dataloader)
 
 
def generate_with_model(
    model,
    tokenizer,
    prompt,
    max_new_tokens=120,
    temperature=0.8,
    top_p=0.92,
    top_k=40,
    repetition_penalty=1.05
):
    """Sample a continuation of *prompt* and return the decoded text.

    The model is put in eval mode and generation runs without gradient
    tracking. The returned string is the decoding of the first generated
    sequence, with special tokens removed.
    """
    model.eval()

    prompt_tensors = {
        name: tensor.to(device)
        for name, tensor in tokenizer(prompt, return_tensors="pt").items()
    }
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        generated_ids = model.generate(**prompt_tensors, **sampling_options)

    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
 
 
def plot_training_history(history):
    """Plot train/validation loss and perplexity against the optimizer step.

    Draws two side-by-side panels (loss on the left, perplexity on the
    right). Validation curves are added only when evaluation steps were
    recorded. When there is no history to show, a message is printed and
    nothing is plotted.
    """
    has_steps = bool(history) and bool(history.get("optimizer_steps"))
    if not has_steps:
        print("No new training history to plot.")
        return

    _, axes = plt.subplots(1, 2, figsize=(13, 4))
    line_style = {"marker": "o", "linewidth": 2}

    # One panel per metric: (axis, history key suffix / label word, y-axis label).
    panels = ((axes[0], "loss", "Loss"), (axes[1], "perplexity", "Perplexity"))
    for axis, metric, y_label in panels:
        axis.plot(
            history["optimizer_steps"],
            history[f"train_{metric}"],
            label=f"Train {metric}",
            **line_style
        )
        if history.get("eval_steps"):
            axis.plot(
                history["eval_steps"],
                history[f"eval_{metric}"],
                label=f"Validation {metric}",
                **line_style
            )
        axis.set_title(f"LoRA fine-tuning: {metric}")
        axis.set_xlabel("Optimizer step")
        axis.set_ylabel(y_label)
        axis.grid(alpha=0.3)
        axis.legend()

    plt.tight_layout()
    plt.show()
 
 
def load_training_summary(summary_path):
    """Return the parsed JSON summary at *summary_path*, or ``{}`` if absent."""
    summary = {}
    if os.path.exists(summary_path):
        with open(summary_path, "r", encoding="utf-8") as summary_file:
            summary = json.loads(summary_file.read())
    return summary

Fine-tune Qwen with LoRA

First run the next cell to define the LoRA training loop, then run the following cell to either train a new adapter or reload one from LLM_finetuning.

def _save_adapter_checkpoint(
    model,
    tokenizer,
    output_dir,
    summary_path,
    best_eval_loss,
    best_eval_perplexity,
    optimizer_step
):
    """Save the adapter, the tokenizer and a JSON run summary to *output_dir*."""
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    with open(summary_path, "w", encoding="utf-8") as handle:
        json.dump(
            {
                "base_model_name": base_model_name,
                "best_eval_loss": best_eval_loss,
                "best_eval_perplexity": best_eval_perplexity,
                "optimizer_steps": optimizer_step,
                "max_seq_length": max_seq_length,
                "cuda_device_id": cuda_device_id,
                "corpus_book_name": corpus_book_name,
                "adapter_output_dir": output_dir
            },
            handle,
            indent=2
        )


def fine_tune_with_lora(model, tokenizer, train_dataset, eval_dataset, output_dir):
    """Train the LoRA adapter and return the best checkpoint reloaded for inference.

    Runs ``max_optimization_steps`` optimizer steps with gradient
    accumulation, linear learning-rate warmup followed by a constant rate,
    mixed-precision autocast with gradient scaling on CUDA, and gradient-norm
    clipping at 1.0. The validation loss is computed every
    ``eval_every_steps`` steps and at the last step. Whenever it improves,
    the adapter, tokenizer and ``training_summary.json`` are written to
    *output_dir*. If no evaluation ever improved on the initial value, the
    final model is evaluated and saved instead.

    Args:
        model: PEFT-wrapped model to train.
        tokenizer: Tokenizer saved next to the adapter.
        train_dataset: Dataset yielding dict batches for training.
        eval_dataset: Dataset yielding dict batches for validation.
        output_dir: Directory that receives the adapter files and summary.

    Returns:
        A dict with keys ``model`` and ``tokenizer`` (freshly reloaded from
        *output_dir*), ``history``, ``best_eval_loss``,
        ``best_eval_perplexity``, ``output_dir`` and ``loaded_from_adapter``
        (always ``False`` here).

    Raises:
        RuntimeError: If training is requested but the training dataloader
            is empty.
    """
    os.makedirs(output_dir, exist_ok=True)

    train_loader = DataLoader(
        train_dataset,
        batch_size=per_device_train_batch_size,
        shuffle=True,
        drop_last=False
    )
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=per_device_eval_batch_size,
        shuffle=False,
        drop_last=False
    )

    # Only the adapter parameters require gradients, so only they are optimized.
    optimizer = torch.optim.AdamW(
        [parameter for parameter in model.parameters() if parameter.requires_grad],
        lr=learning_rate,
        weight_decay=weight_decay
    )
    # Prefer torch.amp.GradScaler when this torch version provides it and
    # fall back to the torch.cuda.amp one otherwise.
    grad_scaler_ctor = getattr(torch.amp, "GradScaler", None)
    if grad_scaler_ctor is not None:
        scaler = grad_scaler_ctor("cuda", enabled=device.type == "cuda")
    else:
        scaler = getattr(torch.cuda.amp, "GradScaler")(enabled=device.type == "cuda")
    summary_path = os.path.join(output_dir, "training_summary.json")

    history = {
        "optimizer_steps": [],
        "train_loss": [],
        "train_perplexity": [],
        "eval_steps": [],
        "eval_loss": [],
        "eval_perplexity": []
    }
    best_eval_loss = float("inf")
    best_eval_perplexity = float("inf")
    optimizer_step = 0
    micro_step = 0
    accumulated_loss = 0.0

    # Fail fast: an empty loader could never advance optimizer_step.
    if max_optimization_steps > 0 and len(train_loader) == 0:
        raise RuntimeError("Training dataloader is empty.")

    while optimizer_step < max_optimization_steps:
        for batch in train_loader:
            micro_step += 1
            batch = {key: value.to(device) for key, value in batch.items()}

            # Linear warmup over the first warmup_steps optimizer steps.
            if warmup_steps > 0 and optimizer_step < warmup_steps:
                current_lr = learning_rate * (optimizer_step + 1) / warmup_steps
            else:
                current_lr = learning_rate
            for param_group in optimizer.param_groups:
                param_group["lr"] = current_lr

            with torch.amp.autocast(device_type=device.type, enabled=device.type == "cuda", dtype=torch.float16):
                outputs = model(**batch)
                # Scale so the accumulated gradient is the mean over micro-batches.
                loss = outputs.loss / gradient_accumulation_steps

            scaler.scale(loss).backward()
            accumulated_loss += outputs.loss.item()

            # Keep accumulating until a full group of micro-batches is seen.
            # The counter is not reset between epochs.
            if micro_step % gradient_accumulation_steps != 0:
                continue

            # Unscale first so clipping sees the true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            optimizer_step += 1
            step_train_loss = accumulated_loss / gradient_accumulation_steps
            step_train_perplexity = loss_to_perplexity(step_train_loss)
            accumulated_loss = 0.0

            history["optimizer_steps"].append(optimizer_step)
            history["train_loss"].append(step_train_loss)
            history["train_perplexity"].append(step_train_perplexity)

            should_log = optimizer_step == 1 or optimizer_step % logging_steps == 0
            should_eval = optimizer_step % eval_every_steps == 0 or optimizer_step == max_optimization_steps

            if should_eval:
                eval_loss = evaluate_loss(model, eval_loader)
                eval_perplexity = loss_to_perplexity(eval_loss)
                history["eval_steps"].append(optimizer_step)
                history["eval_loss"].append(eval_loss)
                history["eval_perplexity"].append(eval_perplexity)
                print(
                    f"[LoRA] step {optimizer_step:03d}/{max_optimization_steps} | "
                    f"train_loss: {step_train_loss:.4f} | train_ppl: {step_train_perplexity:.2f} | "
                    f"val_loss: {eval_loss:.4f} | val_ppl: {eval_perplexity:.2f} | "
                    f"lr: {current_lr:.6f}"
                )

                # Keep only the best adapter seen so far on disk.
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    best_eval_perplexity = eval_perplexity
                    _save_adapter_checkpoint(
                        model,
                        tokenizer,
                        output_dir,
                        summary_path,
                        best_eval_loss,
                        best_eval_perplexity,
                        optimizer_step
                    )
                    print(
                        f"[LoRA] Saved improved adapter to: {output_dir} "
                        f"(best_val_loss={best_eval_loss:.4f}, best_val_ppl={best_eval_perplexity:.2f})"
                    )
            elif should_log:
                print(
                    f"[LoRA] step {optimizer_step:03d}/{max_optimization_steps} | "
                    f"train_loss: {step_train_loss:.4f} | train_ppl: {step_train_perplexity:.2f} | "
                    f"lr: {current_lr:.6f}"
                )

            if optimizer_step >= max_optimization_steps:
                break

    # Nothing was saved during training (no evaluation improved on the
    # initial value): evaluate and save the final state so that the reload
    # below has an adapter to load.
    if best_eval_loss == float("inf"):
        best_eval_loss = evaluate_loss(model, eval_loader)
        best_eval_perplexity = loss_to_perplexity(best_eval_loss)
        _save_adapter_checkpoint(
            model,
            tokenizer,
            output_dir,
            summary_path,
            best_eval_loss,
            best_eval_perplexity,
            optimizer_step
        )

    # Reload from disk so the returned model is the saved (best) adapter,
    # not the last training state.
    tokenizer_for_inference, best_model = load_lora_model_for_inference(base_model_name, output_dir)
    return {
        "model": best_model,
        "tokenizer": tokenizer_for_inference,
        "history": history,
        "best_eval_loss": best_eval_loss,
        "best_eval_perplexity": best_eval_perplexity,
        "output_dir": output_dir,
        "loaded_from_adapter": False
    }
 
# Decide between training a new adapter and reloading a saved one.
adapter_config_path = os.path.join(adapter_output_dir, "adapter_config.json")
training_summary_path = os.path.join(adapter_output_dir, "training_summary.json")
adapter_exists = os.path.exists(adapter_config_path)

print(f"Current working directory: {os.getcwd()}")
print(f"Adapter output directory: {adapter_output_dir}")
print(f"Adapter already exists: {adapter_exists}")
print(f"Skip training if adapter exists: {skip_training_if_adapter_exists}")

if not (skip_training_if_adapter_exists and adapter_exists):
    # Train from scratch; the result holds the best adapter reloaded from disk.
    lora_model = build_lora_model(base_model_name)
    lora_results = fine_tune_with_lora(
        lora_model,
        tokenizer,
        train_dataset,
        eval_dataset,
        adapter_output_dir
    )
    tokenizer = lora_results["tokenizer"]
    fine_tuned_model = lora_results["model"]
else:
    # Reuse the saved adapter; metrics come from the stored summary and the
    # history stays empty because nothing was trained in this session.
    tokenizer, fine_tuned_model = load_lora_model_for_inference(base_model_name, adapter_output_dir)
    summary = load_training_summary(training_summary_path)
    best_eval_loss = summary.get("best_eval_loss", float("nan"))
    best_eval_perplexity = summary.get("best_eval_perplexity", loss_to_perplexity(best_eval_loss))
    _history_keys = (
        "optimizer_steps",
        "train_loss",
        "train_perplexity",
        "eval_steps",
        "eval_loss",
        "eval_perplexity",
    )
    lora_results = {
        "model": fine_tuned_model,
        "tokenizer": tokenizer,
        "history": {key: [] for key in _history_keys},
        "best_eval_loss": best_eval_loss,
        "best_eval_perplexity": best_eval_perplexity,
        "output_dir": adapter_output_dir,
        "loaded_from_adapter": True
    }
    print(f"Loaded saved LoRA adapter instead of retraining: {adapter_output_dir}")

plot_training_history(lora_results["history"])

print(f"Best eval loss: {lora_results['best_eval_loss']:.4f}")
print(f"Best eval perplexity: {lora_results['best_eval_perplexity']:.2f}")
print(f"Adapter directory: {lora_results['output_dir']}")
print(f"Loaded from existing adapter: {lora_results['loaded_from_adapter']}")

# A short sample is generated only right after a fresh training run.
if not lora_results["loaded_from_adapter"]:
    _preview_options = {
        "max_new_tokens": 80,
        "temperature": generation_temperature,
        "top_p": generation_top_p,
        "top_k": generation_top_k,
        "repetition_penalty": generation_repetition_penalty,
    }
    preview_text = generate_with_model(
        fine_tuned_model,
        tokenizer,
        generation_seed,
        **_preview_options
    )
    print("\n--- QUICK PREVIEW ---\n")
    print(preview_text)
    print("\n--- END PREVIEW ---")

Generate text with the LoRA adapter

Run this cell after training, or any time later, to reload the saved adapter from LLM_finetuning and generate text on GPU 1.

adapter_config_path = os.path.join(adapter_output_dir, "adapter_config.json")

# Reload the adapter from disk when no fine-tuned model is held in memory.
if globals().get("fine_tuned_model") is None:
    if not os.path.exists(adapter_config_path):
        raise RuntimeError(
            f"No saved adapter found at {adapter_output_dir!r}. Run the LoRA fine-tuning cell first."
        )
    tokenizer, fine_tuned_model = load_lora_model_for_inference(base_model_name, adapter_output_dir)

sample_prompt = generation_seed
_sampling_options = {
    "max_new_tokens": generation_max_new_tokens,
    "temperature": generation_temperature,
    "top_p": generation_top_p,
    "top_k": generation_top_k,
    "repetition_penalty": generation_repetition_penalty,
}
sample_text = generate_with_model(
    fine_tuned_model,
    tokenizer,
    sample_prompt,
    **_sampling_options
)

print(f"Loaded adapter directory: {adapter_output_dir}")
print(f"Selected device: {device}")
print(f"Prompt:\n{sample_prompt}")
print("\n--- GENERATED TEXT START ---\n")
print(sample_text)
print("\n--- GENERATED TEXT END ---")

Deep Learning: Zero to Hero

Explorer

LLM fine-tuning with LoRA

Use local books

Imports

Set hyperparameters and options

Prepare the LLM and LoRA helpers

Create the LoRA fine-tuning dataset

Training and generation utilities

Fine-tune Qwen with LoRA

Generate text with the LoRA adapter

Graph View

Table of Contents