Source code for fit.nn.modules.normalization

import numpy as np

from fit.core.tensor import Tensor
from fit.nn.modules.base import Layer



[docs]
class BatchNorm(Layer):
    """
    Batch Normalization: normalizes each feature using statistics taken
    over the batch axis (axis 0).

    In training mode the mean/variance of the current batch are used and
    folded into exponential running averages; in eval mode the running
    averages are used instead. The normalized values are then scaled by
    ``gamma`` and shifted by ``beta``.

    Inputs that are not 2-D are flattened to ``(batch, -1)`` before
    normalization, so their flattened feature count must broadcast against
    ``num_features``.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        """
        Initialize batch normalization.

        Args:
            num_features: Number of features; length of ``gamma``/``beta``
                and of the running statistics
            eps: Small constant added to the variance for numerical stability
            momentum: Weight given to the current batch statistics when
                updating the running mean and variance
        """
        super().__init__()

        # Learnable scale and shift, registered with the layer
        self.gamma = Tensor(np.ones(num_features), requires_grad=True)
        self.beta = Tensor(np.zeros(num_features), requires_grad=True)
        self.add_parameter(self.gamma)
        self.add_parameter(self.beta)

        # Running statistics used in eval mode
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Hyperparameters
        self.eps = eps
        self.momentum = momentum
        self.training = True

        # Intermediates of the most recent training-mode forward pass.
        # Kept for inspection; backward uses values captured per call.
        self.cache = None

    def forward(self, x: Tensor):
        """
        Apply batch normalization.

        Args:
            x: Input tensor; axis 0 is treated as the batch axis

        Returns:
            Tensor with the same shape as ``x``
        """
        x_data = x.data

        # Statistics are always taken over axis 0 of a (batch, features) view
        if x_data.ndim == 2:
            x_flat = x_data
        else:
            x_flat = x_data.reshape(x_data.shape[0], -1)

        # Snapshot the mode so backward matches the forward that built it
        training = self.training
        if training:
            mean = np.mean(x_flat, axis=0, keepdims=True)
            var = np.var(x_flat, axis=0, keepdims=True)

            # Exponential moving averages for later inference
            self.running_mean = (
                self.momentum * mean[0] + (1 - self.momentum) * self.running_mean
            )
            self.running_var = (
                self.momentum * var[0] + (1 - self.momentum) * self.running_var
            )
        else:
            mean = self.running_mean
            var = self.running_var

        std = np.sqrt(var + self.eps)
        x_norm = (x_flat - mean) / std

        if training:
            self.cache = (x_flat, mean, var, x_norm)

        # Scale and shift, then restore the caller's shape
        out_data = self.gamma.data * x_norm + self.beta.data
        if x_data.ndim != 2:
            out_data = out_data.reshape(x_data.shape)

        # The output depends on gamma/beta as well as x, so it must carry a
        # gradient whenever any of them does.
        needs_grad = (
            x.requires_grad or self.gamma.requires_grad or self.beta.requires_grad
        )
        out = Tensor(out_data, requires_grad=needs_grad)

        def _backward():
            if out.grad is None:
                return

            # Work in the same flattened layout the forward pass used
            grad_flat = np.reshape(out.grad, x_flat.shape)

            # Parameter gradients do not depend on x.requires_grad
            if self.gamma.requires_grad:
                dgamma = np.sum(grad_flat * x_norm, axis=0)
                self.gamma.grad = (
                    dgamma if self.gamma.grad is None else self.gamma.grad + dgamma
                )

            if self.beta.requires_grad:
                dbeta = np.sum(grad_flat, axis=0)
                self.beta.grad = (
                    dbeta if self.beta.grad is None else self.beta.grad + dbeta
                )

            if not x.requires_grad:
                return

            # Gradient through scale and shift
            dx_norm = grad_flat * self.gamma.data

            if training:
                # Batch mean and variance both depend on x; this is the
                # closed form of the chain rule through them.
                dx = (
                    dx_norm
                    - np.mean(dx_norm, axis=0, keepdims=True)
                    - x_norm * np.mean(dx_norm * x_norm, axis=0, keepdims=True)
                ) / std
            else:
                # Running statistics are constants with respect to x
                dx = dx_norm / std

            if x.data.ndim != 2:
                dx = dx.reshape(x.data.shape)

            x.grad = dx if x.grad is None else x.grad + dx

        out._backward = _backward
        out._prev = {x, self.gamma, self.beta}
        return out

    def train(self):
        """Switch to training mode (batch statistics, running averages updated)."""
        self.training = True

    def eval(self):
        """Switch to eval mode (running statistics, no updates)."""
        self.training = False

    def get_config(self):
        """Return the constructor arguments describing this layer as a dict."""
        return {
            "num_features": len(self.gamma.data),
            "eps": self.eps,
            "momentum": self.momentum,
        }





[docs]
class LayerNorm(Layer):
    """
    Layer Normalization: normalizes inputs across the feature dimension.

    Unlike BatchNorm, LayerNorm normalizes across features for each sample independently.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Initialize layer normalization.

        Args:
            normalized_shape: Shape of the trailing input dimensions to
                normalize over; a single integer is treated as a 1-tuple
            eps: Small constant for numerical stability
        """
        super().__init__()

        # Accept NumPy integer scalars as well as plain ints; forward()
        # relies on len(normalized_shape).
        if isinstance(normalized_shape, (int, np.integer)):
            normalized_shape = (int(normalized_shape),)

        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps

        # Learnable scale and shift, registered with the layer
        self.weight = Tensor(np.ones(self.normalized_shape), requires_grad=True)
        self.bias = Tensor(np.zeros(self.normalized_shape), requires_grad=True)
        self.add_parameter(self.weight)
        self.add_parameter(self.bias)

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply layer normalization.

        Args:
            x: Input tensor whose trailing dimensions match ``normalized_shape``

        Returns:
            Normalized tensor with the same shape as ``x``
        """
        # Normalize over the last len(normalized_shape) dimensions
        norm_axes = tuple(range(-len(self.normalized_shape), 0))

        mean = np.mean(x.data, axis=norm_axes, keepdims=True)
        var = np.var(x.data, axis=norm_axes, keepdims=True)
        std = np.sqrt(var + self.eps)

        normalized = (x.data - mean) / std

        # Scale and shift
        output_data = self.weight.data * normalized + self.bias.data

        # The output depends on weight/bias as well as x, so it must carry a
        # gradient whenever any of them does.
        needs_grad = (
            x.requires_grad or self.weight.requires_grad or self.bias.requires_grad
        )
        output = Tensor(output_data, requires_grad=needs_grad)

        def _backward():
            if output.grad is None:
                return

            grad_out = output.grad

            # Parameters are shared across all leading (non-normalized)
            # dimensions, so their gradients are summed over those axes.
            lead_axes = tuple(range(x.data.ndim - len(self.normalized_shape)))

            if self.weight.requires_grad:
                weight_grad = np.sum(grad_out * normalized, axis=lead_axes)
                self.weight.grad = (
                    weight_grad
                    if self.weight.grad is None
                    else self.weight.grad + weight_grad
                )

            if self.bias.requires_grad:
                bias_grad = np.sum(grad_out, axis=lead_axes)
                self.bias.grad = (
                    bias_grad if self.bias.grad is None else self.bias.grad + bias_grad
                )

            if not x.requires_grad:
                return

            # Mean and variance both depend on x; this is the closed form of
            # the chain rule through them, taken over the normalized axes.
            d_norm = grad_out * self.weight.data
            dx = (
                d_norm
                - np.mean(d_norm, axis=norm_axes, keepdims=True)
                - normalized
                * np.mean(d_norm * normalized, axis=norm_axes, keepdims=True)
            ) / std

            x.grad = dx if x.grad is None else x.grad + dx

        output._backward = _backward
        output._prev = {x, self.weight, self.bias}

        return output