9 Design Patterns for LLM Regularization
9.1 Fundamentals of Regularization in LLMs
import torch
from torch import nn
from transformers import GPT2LMHeadModel, GPT2Config

def create_lm_model(vocab_size=50257, n_positions=1024, n_embd=768, n_layer=12, n_head=12):
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=n_positions,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head,
    )
    model = GPT2LMHeadModel(config)
    return model

model = create_lm_model()
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
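At its core, regularization augments the training objective with a penalty that discourages overly complex solutions. As a minimal sketch of this idea for the model created above, the helper below adds an explicit L2 penalty to the causal language-modeling loss; the function name and the penalty coefficient are illustrative, not tuned values.

def language_modeling_loss_with_l2(model, input_ids, l2_lambda=1e-5):
    # Standard causal LM loss (labels are shifted internally) plus an explicit L2 penalty on all parameters.
    outputs = model(input_ids=input_ids, labels=input_ids)
    l2_penalty = sum(p.pow(2).sum() for p in model.parameters())
    return outputs.loss + l2_lambda * l2_penalty

# A tiny batch of random token IDs, only to show the call shape:
# dummy_ids = torch.randint(0, 50257, (2, 16))
# loss = language_modeling_loss_with_l2(model, dummy_ids)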
9.2 Weight Decay and L2 Regularization for Large Language Models
from torch.optim import AdamW

def train_with_weight_decay(model, train_dataloader, weight_decay=0.01, lr=5e-5, epochs=3):
    # AdamW applies decoupled weight decay, the standard L2-style regularizer for transformer training.
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            # Each batch is expected to be a dict with input_ids, attention_mask, and labels.
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")

# Assuming you have a train_dataloader
# train_with_weight_decay(model, train_dataloader)
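In practice, weight decay is usually applied only to weight matrices, while biases and LayerNorm parameters are left undecayed. Below is a minimal sketch of that convention using optimizer parameter groups, assuming the GPT-2 parameter naming of the model above; the no_decay name patterns are illustrative.

def create_weight_decay_optimizer(model, weight_decay=0.01, lr=5e-5):
    # Split parameters into decayed weights and non-decayed biases/LayerNorm weights.
    no_decay = ("bias", "ln_1.weight", "ln_2.weight", "ln_f.weight")
    decay_params = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]
    no_decay_params = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]
    param_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    return AdamW(param_groups, lr=lr)

# optimizer = create_weight_decay_optimizer(model)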
9.3 Dropout Techniques in LLM Architectures
class TransformerWithDropout(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Embedding(1000, d_model)  # Simplified learned positional encoding (up to 1000 positions)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=4 * d_model, dropout=dropout),
            num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Apply dropout to the summed token and positional embeddings before the encoder stack.
        x = self.embedding(x) + self.pos_encoder(torch.arange(x.size(1), device=x.device))
        x = self.dropout(x)
        x = x.transpose(0, 1)  # (batch, seq, d_model) -> (seq, batch, d_model), as nn.TransformerEncoder expects
        x = self.transformer(x)
        x = x.transpose(0, 1)
        return self.fc_out(x)

model = TransformerWithDropout(vocab_size=50257, d_model=768, nhead=12, num_layers=12, dropout=0.1)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
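For the Hugging Face GPT-2 model from Section 9.1, the same dropout sites are exposed as configuration hyperparameters rather than constructor arguments. A brief sketch, reusing the GPT2Config and GPT2LMHeadModel imports from earlier:

def create_lm_model_with_dropout(dropout=0.1):
    # GPT2Config exposes separate rates for embedding, residual, and attention dropout.
    config = GPT2Config(
        embd_pdrop=dropout,   # dropout on token + position embeddings
        resid_pdrop=dropout,  # dropout on residual/MLP outputs
        attn_pdrop=dropout,   # dropout on attention probabilities
    )
    return GPT2LMHeadModel(config)

# model = create_lm_model_with_dropout(dropout=0.1)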
9.4 Layer-wise Adaptive Regularization Strategies
class LayerwiseAdaptiveRegularization(nn.Module):
    def __init__(self, base_model, num_layers, base_dropout=0.1, dropout_increase_per_layer=0.02):
        super().__init__()
        self.base_model = base_model
        self.num_layers = num_layers
        self.base_dropout = base_dropout
        self.dropout_increase_per_layer = dropout_increase_per_layer
        self.set_layerwise_dropout()

    def set_layerwise_dropout(self):
        # Increase dropout linearly with depth so deeper blocks are regularized more strongly.
        for i, layer in enumerate(self.base_model.transformer.h):
            dropout = self.base_dropout + i * self.dropout_increase_per_layer
            layer.attn.attn_dropout.p = dropout
            layer.attn.resid_dropout.p = dropout
            layer.mlp.dropout.p = dropout

    def forward(self, *args, **kwargs):
        return self.base_model(*args, **kwargs)

base_model = create_lm_model()
model = LayerwiseAdaptiveRegularization(base_model, num_layers=12)
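The same depth-dependent idea can be applied to weight decay by giving deeper blocks their own optimizer parameter groups. A minimal sketch, assuming the GPT-2 parameter naming used above (transformer.h.<i>.); the linear decay schedule is illustrative.

def create_layerwise_weight_decay_optimizer(model, num_layers=12, base_decay=0.01,
                                            decay_increase_per_layer=0.002, lr=5e-5):
    # Assign a larger weight-decay coefficient to parameters in deeper transformer blocks.
    param_groups = []
    for i in range(num_layers):
        layer_params = [p for n, p in model.named_parameters() if f"transformer.h.{i}." in n]
        param_groups.append({
            "params": layer_params,
            "weight_decay": base_decay + i * decay_increase_per_layer,
        })
    # Parameters outside the blocks (embeddings, final LayerNorm, LM head) keep the base decay.
    other_params = [p for n, p in model.named_parameters() if "transformer.h." not in n]
    param_groups.append({"params": other_params, "weight_decay": base_decay})
    return AdamW(param_groups, lr=lr)

# optimizer = create_layerwise_weight_decay_optimizer(base_model)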
9.5 Gradient Clipping and Noise Injection for LLM Stability
from torch.nn.utils import clip_grad_norm_

def train_with_grad_clip_and_noise(model, train_dataloader, grad_clip=1.0, noise_factor=0.01, lr=5e-5, epochs=3):
    optimizer = AdamW(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            # Perturb the input token IDs with Gaussian noise, then cast back to valid vocabulary indices.
            input_ids = batch['input_ids']
            noise = torch.randn_like(input_ids, dtype=torch.float) * noise_factor
            noisy_inputs = input_ids.float() + noise
            noisy_inputs = noisy_inputs.long().clamp(min=0, max=model.config.vocab_size - 1)
            outputs = model(input_ids=noisy_inputs, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            # Rescale gradients so their global norm does not exceed grad_clip.
            clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")

# Assuming you have a train_dataloader
# train_with_grad_clip_and_noise(model, train_dataloader)
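Because token IDs are discrete, adding noise and casting back to integers gives only coarse control over the perturbation. A common alternative injects the noise into the continuous embeddings instead. A minimal sketch of that variant, assuming the Hugging Face model's get_input_embeddings() accessor and inputs_embeds argument; the noise scale is illustrative.

def train_with_embedding_noise(model, train_dataloader, noise_std=0.01, grad_clip=1.0, lr=5e-5, epochs=3):
    optimizer = AdamW(model.parameters(), lr=lr)
    embedding_layer = model.get_input_embeddings()
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            # Perturb the continuous token embeddings rather than the discrete token IDs.
            embeds = embedding_layer(input_ids)
            embeds = embeds + torch.randn_like(embeds) * noise_std
            outputs = model(inputs_embeds=embeds, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")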
9.6 Combining Regularization Methods: Synergies and Trade-offs
from torch.nn.utils import clip_grad_norm_

def train_with_combined_regularization(model, train_dataloader, weight_decay=0.01, dropout=0.1,
                                       grad_clip=1.0, lr=5e-5, epochs=3):
    # Set the requested dropout rate on every dropout module in the model.
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = dropout
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")

# Assuming you have a train_dataloader
# train_with_combined_regularization(model, train_dataloader)
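A practical way to manage the trade-offs is to sweep a few combinations of regularization strengths and keep whichever gives the best validation loss. A minimal sketch, assuming a separate validation dataloader and a hypothetical evaluate() helper that returns average validation loss; both are placeholders, not part of the code above.

def sweep_regularization(train_dataloader, val_dataloader,
                         weight_decays=(0.0, 0.01, 0.1), dropouts=(0.0, 0.1, 0.2)):
    best = None
    for wd in weight_decays:
        for dr in dropouts:
            model = create_lm_model()  # fresh model for each configuration
            train_with_combined_regularization(model, train_dataloader,
                                               weight_decay=wd, dropout=dr, epochs=1)
            val_loss = evaluate(model, val_dataloader)  # hypothetical evaluation helper
            if best is None or val_loss < best[0]:
                best = (val_loss, wd, dr)
    return best  # (best validation loss, weight decay, dropout)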
9.7 Regularization in Transfer Learning and Fine-tuning Scenarios
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def fine_tune_with_adaptive_regularization(pretrained_model_name, train_dataloader,
                                           initial_dropout=0.1, epochs=3):
    model = GPT2LMHeadModel.from_pretrained(pretrained_model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)
    optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        # Linearly anneal dropout toward zero as fine-tuning progresses.
        current_dropout = initial_dropout * (1 - epoch / epochs)
        for module in model.modules():
            if isinstance(module, nn.Dropout):
                module.p = current_dropout
        for batch in train_dataloader:
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}, Dropout: {current_dropout:.4f}")

# Assuming you have a train_dataloader
# fine_tune_with_adaptive_regularization('gpt2', train_dataloader)
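Another common regularizer when fine-tuning a pretrained model is to freeze the lower transformer blocks so that only the upper layers adapt to the new data. A minimal sketch for the GPT-2 model used above; the number of frozen blocks is an illustrative choice.

def freeze_lower_layers(model, num_frozen_blocks=6):
    # Freeze the token/position embeddings and the first num_frozen_blocks transformer blocks.
    for param in model.transformer.wte.parameters():
        param.requires_grad = False
    for param in model.transformer.wpe.parameters():
        param.requires_grad = False
    for block in model.transformer.h[:num_frozen_blocks]:
        for param in block.parameters():
            param.requires_grad = False
    return model

# model = GPT2LMHeadModel.from_pretrained('gpt2')
# model = freeze_lower_layers(model, num_frozen_blocks=6)
# optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=5e-5, weight_decay=0.01)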
9.8 Emerging Regularization Techniques for Next-generation LLMs
class MixOut(nn.Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        # Only active during training: interpolate each example's activations
        # with those of another randomly chosen example in the batch.
        if not self.training:
            return x
        batch_size = x.size(0)
        perm = torch.randperm(batch_size).to(x.device)
        mixed = self.p * x + (1 - self.p) * x[perm]
        return mixed

class TransformerWithMixOut(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, mixout_prob=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Embedding(1000, d_model)  # Simplified learned positional encoding
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=4 * d_model),
            num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.mixout = MixOut(p=mixout_prob)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder(torch.arange(x.size(1), device=x.device))
        x = self.mixout(x)
        x = x.transpose(0, 1)  # nn.TransformerEncoder expects (seq, batch, d_model) by default
        x = self.transformer(x)
        x = x.transpose(0, 1)
        return self.fc_out(x)

model = TransformerWithMixOut(vocab_size=50257, d_model=768, nhead=12, num_layers=12, mixout_prob=0.5)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
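The MixOut module above mixes activations across examples in the batch, in the spirit of mixup. A related technique, also referred to as mixout in the fine-tuning literature, operates on weights instead: parameters are stochastically reset to their pretrained values during fine-tuning. The sketch below is a simplified illustration of that weight-level idea, assuming a saved snapshot of the pretrained parameters; it omits the rescaling used in full implementations.

def apply_weight_mixout(model, pretrained_state, p=0.1):
    # With probability p, reset each parameter entry to its pretrained value.
    with torch.no_grad():
        for name, param in model.named_parameters():
            pretrained = pretrained_state[name].to(param.device)
            mask = (torch.rand_like(param) < p).float()
            param.copy_(mask * pretrained + (1 - mask) * param)

# pretrained_state = {k: v.clone() for k, v in model.state_dict().items()}
# ... fine-tune for some steps ...
# apply_weight_mixout(model, pretrained_state, p=0.1)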