9 Design Patterns for LLM Regularization

9.1 Fundamentals of Regularization in LLMs

import torch
from torch import nn
from transformers import GPT2LMHeadModel, GPT2Config

def create_lm_model(vocab_size=50257, n_positions=1024, n_embd=768, n_layer=12, n_head=12):
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=n_positions,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head
    )
    model = GPT2LMHeadModel(config)
    return model

model = create_lm_model()
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

9.2 Weight Decay and L2 Regularization for Large Language Models

from torch.optim import AdamW

def train_with_weight_decay(model, train_dataloader, weight_decay=0.01, lr=5e-5, epochs=3):
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            # batch is assumed to be a dict with input_ids, attention_mask, and labels
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader):.4f}")

# Assuming you have a train_dataloader
# train_with_weight_decay(model, train_dataloader)
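
A common refinement, not shown above, is to exempt biases and LayerNorm parameters from weight decay, since decaying them rarely helps and can hurt. A minimal sketch, assuming GPT-2-style parameter names (e.g. 'ln_1', 'ln_f'):

# Sketch: apply weight decay only to weight matrices, not to biases or LayerNorm
no_decay = ("bias", "ln_")  # name fragments; adjust for other architectures
grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(grouped_parameters, lr=5e-5)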

9.3 Dropout Techniques in LLM Architectures

class TransformerWithDropout(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Embedding(1000, d_model)  # Simplified positional encoding
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=4*d_model, dropout=dropout),
            num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch_size, seq_len) token ids
        x = self.embedding(x) + self.pos_encoder(torch.arange(x.size(1), device=x.device))
        x = self.dropout(x)
        x = x.transpose(0, 1)  # nn.TransformerEncoderLayer defaults to (seq, batch, d_model)
        x = self.transformer(x)
        x = x.transpose(0, 1)
        return self.fc_out(x)

model = TransformerWithDropout(vocab_size=50257, d_model=768, nhead=12, num_layers=12, dropout=0.1)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

9.4 Layer-wise Adaptive Regularization Strategies

class LayerwiseAdaptiveRegularization(nn.Module):
    def __init__(self, base_model, num_layers, base_dropout=0.1, dropout_increase_per_layer=0.02):
        super().__init__()
        self.base_model = base_model
        self.num_layers = num_layers
        self.base_dropout = base_dropout
        self.dropout_increase_per_layer = dropout_increase_per_layer
        self.set_layerwise_dropout()

    def set_layerwise_dropout(self):
        # GPT-2 blocks expose separate dropout modules for the attention and MLP sublayers
        for i, layer in enumerate(self.base_model.transformer.h):
            dropout = self.base_dropout + i * self.dropout_increase_per_layer
            layer.attn.attn_dropout.p = dropout
            layer.attn.resid_dropout.p = dropout
            layer.mlp.dropout.p = dropout

    def forward(self, *args, **kwargs):
        return self.base_model(*args, **kwargs)

base_model = create_lm_model()
model = LayerwiseAdaptiveRegularization(base_model, num_layers=12)
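
A quick sanity check of the resulting schedule (assuming the GPT-2 dropout attribute names used above):

# Print the dropout probability assigned to each transformer block
for i, layer in enumerate(model.base_model.transformer.h):
    print(f"Layer {i:2d}: attn dropout = {layer.attn.attn_dropout.p:.2f}, "
          f"mlp dropout = {layer.mlp.dropout.p:.2f}")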

9.5 Gradient Clipping and Noise Injection for LLM Stability

from torch.nn.utils import clip_grad_norm_

def train_with_grad_clip_and_noise(model, train_dataloader, grad_clip=1.0, noise_factor=0.01, lr=5e-5, epochs=3):
    optimizer = AdamW(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            
            # Inject Gaussian noise into the input embeddings rather than the
            # discrete token ids (perturbing integer ids would either do nothing
            # or substitute unrelated tokens)
            input_ids = batch['input_ids']
            input_embeds = model.get_input_embeddings()(input_ids)
            noisy_embeds = input_embeds + torch.randn_like(input_embeds) * noise_factor
            
            outputs = model(inputs_embeds=noisy_embeds, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            
            clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader):.4f}")

# Assuming you have a train_dataloader
# train_with_grad_clip_and_noise(model, train_dataloader)
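
clip_grad_norm_ returns the total gradient norm computed before clipping, which is useful for monitoring training stability. A minimal sketch of a helper that logs unusually large norms (the threshold is illustrative):

def clip_and_monitor(model, max_norm=1.0, warn_threshold=10.0):
    # clip_grad_norm_ clips in place and returns the pre-clipping total norm
    grad_norm = float(clip_grad_norm_(model.parameters(), max_norm))
    if grad_norm > warn_threshold:  # illustrative threshold
        print(f"Warning: gradient norm {grad_norm:.2f} before clipping")
    return grad_norm

# Call clip_and_monitor(model) in place of clip_grad_norm_ inside the training loop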

9.6 Combining Regularization Methods: Synergies and Trade-offs

from torch.nn.utils import clip_grad_norm_

def train_with_combined_regularization(model, train_dataloader, weight_decay=0.01, dropout=0.1,
                                       grad_clip=1.0, lr=5e-5, epochs=3):
    # Apply the requested dropout rate to every dropout module in the model
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = dropout

    model.train()
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            # batch is assumed to be a dict with input_ids, attention_mask, and labels
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader):.4f}")

# Assuming you have a train_dataloader
# train_with_combined_regularization(model, train_dataloader)
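
Because the methods interact, their strengths are best tuned jointly rather than one at a time; a minimal sketch of a small joint sweep (values are illustrative):

# Illustrative: sweep weight decay and dropout together rather than in isolation
# for wd in (0.0, 0.01, 0.1):
#     for p in (0.0, 0.1, 0.2):
#         model = create_lm_model()
#         train_with_combined_regularization(model, train_dataloader,
#                                            weight_decay=wd, dropout=p)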

9.7 Regularization in Transfer Learning and Fine-tuning Scenarios

from transformers import GPT2LMHeadModel, GPT2Tokenizer

def fine_tune_with_adaptive_regularization(pretrained_model_name, train_dataloader, 
                                           initial_dropout=0.1, epochs=3):
    model = GPT2LMHeadModel.from_pretrained(pretrained_model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)
    
    optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        current_dropout = initial_dropout * (1 - epoch / epochs)
        
        for module in model.modules():
            if isinstance(module, nn.Dropout):
                module.p = current_dropout
        
        for batch in train_dataloader:
            optimizer.zero_grad()
            # batch is assumed to be a dict with input_ids, attention_mask, and labels
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader):.4f}, Dropout: {current_dropout:.4f}")

# Assuming you have a train_dataloader
# fine_tune_with_adaptive_regularization('gpt2', train_dataloader)
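
For reference, the linear decay used above produces the following dropout values with initial_dropout=0.1 and epochs=3:

# Dropout schedule: initial_dropout * (1 - epoch / epochs)
for epoch in range(3):
    print(f"Epoch {epoch+1}: dropout = {0.1 * (1 - epoch / 3):.3f}")
# Epoch 1: dropout = 0.100, Epoch 2: dropout = 0.067, Epoch 3: dropout = 0.033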

9.8 Emerging Regularization Techniques for Next-generation LLMs

class MixOut(nn.Module):
    # Note: this module interpolates hidden states between examples in the batch
    # (a mixup-style scheme), not the weight-mixing Mixout regularizer of Lee et al.
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        if not self.training:
            return x
        
        batch_size = x.size(0)
        perm = torch.randperm(batch_size).to(x.device)
        mixed = self.p * x + (1 - self.p) * x[perm]
        return mixed

class TransformerWithMixOut(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, mixout_prob=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Embedding(1000, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=4*d_model),
            num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.mixout = MixOut(p=mixout_prob)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder(torch.arange(x.size(1), device=x.device))
        x = self.mixout(x)
        x = x.transpose(0, 1)
        x = self.transformer(x)
        x = x.transpose(0, 1)
        return self.fc_out(x)

model = TransformerWithMixOut(vocab_size=50257, d_model=768, nhead=12, num_layers=12, mixout_prob=0.5)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")