Source code for fit.optim.adam

"""
Adam optimizer implementation.
"""

import numpy as np
from typing import List
from fit.core.tensor import Tensor


class Adam:
    """
    Adam optimizer with bias correction.

    Combines the advantages of AdaGrad and RMSProp. Maintains moving averages
    of both gradients and squared gradients.
    """

    def __init__(
        self,
        parameters: List["Tensor"],
        lr: float = 0.001,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.0,
    ):
        """
        Initialize Adam optimizer.

        Args:
            parameters: Parameters to optimize. Any iterable is accepted; it is
                materialized into a list because it is traversed again on
                every call to ``step`` and ``zero_grad``.
            lr: Learning rate
            betas: Coefficients for computing running averages (beta1, beta2)
            eps: Small constant for numerical stability
            weight_decay: L2 penalty coefficient

        Raises:
            ValueError: If a hyperparameter is outside its valid range.
        """
        beta1, beta2 = betas
        self._check_hyperparameters(lr, beta1, beta2, eps, weight_decay)

        # A one-shot iterable (e.g. a generator) would be consumed by the
        # state-initialization loop below and leave nothing for step().
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2 = beta1, beta2
        self.eps = eps
        self.weight_decay = weight_decay

        # Per-parameter state, keyed by the parameter's position in the list
        self.state = {}
        for i, param in enumerate(self.parameters):
            self.state[i] = {
                "m": np.zeros_like(param.data),  # First moment
                "v": np.zeros_like(param.data),  # Second moment
                "step": 0,
            }

    @staticmethod
    def _check_hyperparameters(lr, beta1, beta2, eps, weight_decay):
        """Raise ValueError when a hyperparameter is out of range."""
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        # beta == 1 makes the bias-correction denominator 1 - beta**step zero.
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 value: {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 value: {beta2}")
        if eps < 0.0:
            raise ValueError(f"Invalid eps value: {eps}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

    @staticmethod
    def _copy_state(state):
        """Return a copy of ``state`` with every NumPy array duplicated."""
        return {
            key: {
                name: value.copy() if isinstance(value, np.ndarray) else value
                for name, value in entry.items()
            }
            for key, entry in state.items()
        }

    def step(self):
        """Perform one optimization step.

        Parameters whose ``grad`` is ``None`` are skipped and their step
        counter is not advanced.
        """
        for i, param in enumerate(self.parameters):
            if param.grad is None:
                continue

            grad = param.grad
            state = self.state[i]

            # Add weight decay (L2 penalty folded into the gradient)
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data

            # Update step count
            state["step"] += 1
            step = state["step"]

            # Update biased first moment estimate
            state["m"] = self.beta1 * state["m"] + (1 - self.beta1) * grad

            # Update biased second raw moment estimate
            state["v"] = self.beta2 * state["v"] + (1 - self.beta2) * grad * grad

            # Compute bias-corrected moment estimates
            m_hat = state["m"] / (1 - self.beta1 ** step)
            v_hat = state["v"] / (1 - self.beta2 ** step)

            # Update parameters
            param.data = param.data - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    def zero_grad(self):
        """Zero all parameter gradients by resetting them to ``None``."""
        for param in self.parameters:
            param.grad = None

    def state_dict(self):
        """Return optimizer state.

        The per-parameter state is copied, so the returned dict is a snapshot
        that later calls to ``step`` do not modify.
        """
        return {
            "lr": self.lr,
            "betas": (self.beta1, self.beta2),
            "eps": self.eps,
            "weight_decay": self.weight_decay,
            "state": self._copy_state(self.state),
        }

    def load_state_dict(self, state_dict):
        """Load optimizer state.

        Args:
            state_dict: Dict with the layout produced by ``state_dict``. Its
                per-parameter state is copied, so the caller's dict is not
                modified by subsequent optimization steps.
        """
        self.lr = state_dict["lr"]
        self.beta1, self.beta2 = state_dict["betas"]
        self.eps = state_dict["eps"]
        self.weight_decay = state_dict["weight_decay"]
        self.state = self._copy_state(state_dict["state"])
class AdamW:
    """
    AdamW optimizer with decoupled weight decay.

    Implements the weight decay fix from "Decoupled Weight Decay Regularization".
    """

    def __init__(
        self,
        parameters: List["Tensor"],
        lr: float = 0.001,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.01,
    ):
        """
        Initialize AdamW optimizer.

        Args:
            parameters: Parameters to optimize. Any iterable is accepted; it is
                materialized into a list because it is traversed again on
                every call to ``step`` and ``zero_grad``.
            lr: Learning rate
            betas: Coefficients for computing running averages (beta1, beta2)
            eps: Small constant for numerical stability
            weight_decay: Weight decay coefficient (decoupled)

        Raises:
            ValueError: If a hyperparameter is outside its valid range.
        """
        beta1, beta2 = betas
        self._check_hyperparameters(lr, beta1, beta2, eps, weight_decay)

        # A one-shot iterable (e.g. a generator) would be consumed by the
        # state-initialization loop below and leave nothing for step().
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2 = beta1, beta2
        self.eps = eps
        self.weight_decay = weight_decay

        # Per-parameter state, keyed by the parameter's position in the list
        self.state = {}
        for i, param in enumerate(self.parameters):
            self.state[i] = {
                "m": np.zeros_like(param.data),  # First moment
                "v": np.zeros_like(param.data),  # Second moment
                "step": 0,
            }

    @staticmethod
    def _check_hyperparameters(lr, beta1, beta2, eps, weight_decay):
        """Raise ValueError when a hyperparameter is out of range."""
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        # beta == 1 makes the bias-correction denominator 1 - beta**step zero.
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 value: {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 value: {beta2}")
        if eps < 0.0:
            raise ValueError(f"Invalid eps value: {eps}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

    @staticmethod
    def _copy_state(state):
        """Return a copy of ``state`` with every NumPy array duplicated."""
        return {
            key: {
                name: value.copy() if isinstance(value, np.ndarray) else value
                for name, value in entry.items()
            }
            for key, entry in state.items()
        }

    def step(self):
        """Perform one optimization step.

        Parameters whose ``grad`` is ``None`` are skipped and their step
        counter is not advanced.
        """
        for i, param in enumerate(self.parameters):
            if param.grad is None:
                continue

            grad = param.grad
            state = self.state[i]

            # Update step count
            state["step"] += 1
            step = state["step"]

            # Update biased first moment estimate
            state["m"] = self.beta1 * state["m"] + (1 - self.beta1) * grad

            # Update biased second raw moment estimate
            state["v"] = self.beta2 * state["v"] + (1 - self.beta2) * grad * grad

            # Compute bias-corrected moment estimates
            m_hat = state["m"] / (1 - self.beta1 ** step)
            v_hat = state["v"] / (1 - self.beta2 ** step)

            # Update parameters with decoupled weight decay: the decay term is
            # applied to the weights directly and never enters the moments.
            adaptive_update = m_hat / (np.sqrt(v_hat) + self.eps)
            param.data = param.data - self.lr * (
                adaptive_update + self.weight_decay * param.data
            )

    def zero_grad(self):
        """Zero all parameter gradients by resetting them to ``None``."""
        for param in self.parameters:
            param.grad = None

    def state_dict(self):
        """Return optimizer state.

        The per-parameter state is copied, so the returned dict is a snapshot
        that later calls to ``step`` do not modify.
        """
        return {
            "lr": self.lr,
            "betas": (self.beta1, self.beta2),
            "eps": self.eps,
            "weight_decay": self.weight_decay,
            "state": self._copy_state(self.state),
        }

    def load_state_dict(self, state_dict):
        """Load optimizer state.

        Args:
            state_dict: Dict with the layout produced by ``state_dict``. Its
                per-parameter state is copied, so the caller's dict is not
                modified by subsequent optimization steps.
        """
        self.lr = state_dict["lr"]
        self.beta1, self.beta2 = state_dict["betas"]
        self.eps = state_dict["eps"]
        self.weight_decay = state_dict["weight_decay"]
        self.state = self._copy_state(state_dict["state"])
class Adamax:
    """
    Adamax optimizer (variant of Adam based on infinity norm).
    """

    def __init__(
        self,
        parameters: List["Tensor"],
        lr: float = 0.002,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.0,
    ):
        """
        Initialize Adamax optimizer.

        Args:
            parameters: Parameters to optimize. Any iterable is accepted; it is
                materialized into a list because it is traversed again on
                every call to ``step`` and ``zero_grad``.
            lr: Learning rate
            betas: Coefficients for computing running averages (beta1, beta2)
            eps: Small constant for numerical stability
            weight_decay: L2 penalty coefficient

        Raises:
            ValueError: If a hyperparameter is outside its valid range.
        """
        beta1, beta2 = betas
        self._check_hyperparameters(lr, beta1, beta2, eps, weight_decay)

        # A one-shot iterable (e.g. a generator) would be consumed by the
        # state-initialization loop below and leave nothing for step().
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2 = beta1, beta2
        self.eps = eps
        self.weight_decay = weight_decay

        # Per-parameter state, keyed by the parameter's position in the list
        self.state = {}
        for i, param in enumerate(self.parameters):
            self.state[i] = {
                "m": np.zeros_like(param.data),  # First moment
                "u": np.zeros_like(param.data),  # Infinity norm
                "step": 0,
            }

    @staticmethod
    def _check_hyperparameters(lr, beta1, beta2, eps, weight_decay):
        """Raise ValueError when a hyperparameter is out of range."""
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        # beta1 == 1 makes the bias-correction denominator 1 - beta1**step zero.
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 value: {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 value: {beta2}")
        if eps < 0.0:
            raise ValueError(f"Invalid eps value: {eps}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

    @staticmethod
    def _copy_state(state):
        """Return a copy of ``state`` with every NumPy array duplicated."""
        return {
            key: {
                name: value.copy() if isinstance(value, np.ndarray) else value
                for name, value in entry.items()
            }
            for key, entry in state.items()
        }

    def step(self):
        """Perform one optimization step.

        Parameters whose ``grad`` is ``None`` are skipped and their step
        counter is not advanced.
        """
        for i, param in enumerate(self.parameters):
            if param.grad is None:
                continue

            grad = param.grad
            state = self.state[i]

            # Add weight decay (L2 penalty folded into the gradient)
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data

            # Update step count
            state["step"] += 1

            # Update biased first moment estimate
            state["m"] = self.beta1 * state["m"] + (1 - self.beta1) * grad

            # Update infinity norm: decayed running maximum of |grad|
            state["u"] = np.maximum(self.beta2 * state["u"], np.abs(grad))

            # Compute bias-corrected first moment estimate
            m_hat = state["m"] / (1 - self.beta1 ** state["step"])

            # Update parameters; only the first moment is bias-corrected here
            param.data = param.data - self.lr * m_hat / (state["u"] + self.eps)

    def zero_grad(self):
        """Zero all parameter gradients by resetting them to ``None``."""
        for param in self.parameters:
            param.grad = None

    def state_dict(self):
        """Return optimizer state.

        The per-parameter state is copied, so the returned dict is a snapshot
        that later calls to ``step`` do not modify.
        """
        return {
            "lr": self.lr,
            "betas": (self.beta1, self.beta2),
            "eps": self.eps,
            "weight_decay": self.weight_decay,
            "state": self._copy_state(self.state),
        }

    def load_state_dict(self, state_dict):
        """Load optimizer state.

        Args:
            state_dict: Dict with the layout produced by ``state_dict``. Its
                per-parameter state is copied, so the caller's dict is not
                modified by subsequent optimization steps.
        """
        self.lr = state_dict["lr"]
        self.beta1, self.beta2 = state_dict["betas"]
        self.eps = state_dict["eps"]
        self.weight_decay = state_dict["weight_decay"]
        self.state = self._copy_state(state_dict["state"])
class NAdam:
    """
    NAdam optimizer (Nesterov-accelerated Adam).
    """

    def __init__(
        self,
        parameters: List["Tensor"],
        lr: float = 0.002,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.0,
    ):
        """
        Initialize NAdam optimizer.

        Args:
            parameters: Parameters to optimize. Any iterable is accepted; it is
                materialized into a list because it is traversed again on
                every call to ``step`` and ``zero_grad``.
            lr: Learning rate
            betas: Coefficients for computing running averages (beta1, beta2)
            eps: Small constant for numerical stability
            weight_decay: L2 penalty coefficient

        Raises:
            ValueError: If a hyperparameter is outside its valid range.
        """
        beta1, beta2 = betas
        self._check_hyperparameters(lr, beta1, beta2, eps, weight_decay)

        # A one-shot iterable (e.g. a generator) would be consumed by the
        # state-initialization loop below and leave nothing for step().
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2 = beta1, beta2
        self.eps = eps
        self.weight_decay = weight_decay

        # Per-parameter state, keyed by the parameter's position in the list
        self.state = {}
        for i, param in enumerate(self.parameters):
            self.state[i] = {
                "m": np.zeros_like(param.data),  # First moment
                "v": np.zeros_like(param.data),  # Second moment
                "step": 0,
            }

    @staticmethod
    def _check_hyperparameters(lr, beta1, beta2, eps, weight_decay):
        """Raise ValueError when a hyperparameter is out of range."""
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        # beta == 1 makes the bias-correction denominator 1 - beta**step zero.
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 value: {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 value: {beta2}")
        if eps < 0.0:
            raise ValueError(f"Invalid eps value: {eps}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

    @staticmethod
    def _copy_state(state):
        """Return a copy of ``state`` with every NumPy array duplicated."""
        return {
            key: {
                name: value.copy() if isinstance(value, np.ndarray) else value
                for name, value in entry.items()
            }
            for key, entry in state.items()
        }

    def step(self):
        """Perform one optimization step.

        Parameters whose ``grad`` is ``None`` are skipped and their step
        counter is not advanced.
        """
        for i, param in enumerate(self.parameters):
            if param.grad is None:
                continue

            grad = param.grad
            state = self.state[i]

            # Add weight decay (L2 penalty folded into the gradient)
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data

            # Update step count
            state["step"] += 1

            # Update biased first moment estimate
            state["m"] = self.beta1 * state["m"] + (1 - self.beta1) * grad

            # Update biased second raw moment estimate
            state["v"] = self.beta2 * state["v"] + (1 - self.beta2) * grad * grad

            # Both moments share these bias-correction denominators
            first_correction = 1 - self.beta1 ** state["step"]
            second_correction = 1 - self.beta2 ** state["step"]

            # Compute bias-corrected moment estimates
            m_hat = state["m"] / first_correction
            v_hat = state["v"] / second_correction

            # Nesterov acceleration: blend the corrected momentum with the
            # bias-corrected current gradient
            m_nesterov = (
                self.beta1 * m_hat + (1 - self.beta1) * grad / first_correction
            )

            # Update parameters
            param.data = param.data - self.lr * m_nesterov / (
                np.sqrt(v_hat) + self.eps
            )

    def zero_grad(self):
        """Zero all parameter gradients by resetting them to ``None``."""
        for param in self.parameters:
            param.grad = None

    def state_dict(self):
        """Return optimizer state.

        The per-parameter state is copied, so the returned dict is a snapshot
        that later calls to ``step`` do not modify.
        """
        return {
            "lr": self.lr,
            "betas": (self.beta1, self.beta2),
            "eps": self.eps,
            "weight_decay": self.weight_decay,
            "state": self._copy_state(self.state),
        }

    def load_state_dict(self, state_dict):
        """Load optimizer state.

        Args:
            state_dict: Dict with the layout produced by ``state_dict``. Its
                per-parameter state is copied, so the caller's dict is not
                modified by subsequent optimization steps.
        """
        self.lr = state_dict["lr"]
        self.beta1, self.beta2 = state_dict["betas"]
        self.eps = state_dict["eps"]
        self.weight_decay = state_dict["weight_decay"]
        self.state = self._copy_state(state_dict["state"])