"""
Implementation of activation functions for neural networks.
"""
import numpy as np
from fit.core.tensor import Tensor
from fit.nn.modules.base import Layer
[docs]
class ReLU(Layer):
[docs]
def forward(self, x):
out = Tensor((x.data > 0) * x.data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
grad = (x.data > 0).astype(
float
) * out.grad # Multiply with upstream grad
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class Softmax(Layer):
[docs]
def forward(self, x: Tensor, axis=-1):
"""
Apply softmax function along specified axis.
Args:
x: Input tensor
axis: Axis along which to apply softmax
Returns:
Softmax output
"""
# Numerically stable softmax
x_max = Tensor(np.max(x.data, axis=axis, keepdims=True))
x_shifted = x - x_max
exp_x = x_shifted.exp()
sum_exp = Tensor(np.sum(exp_x.data, axis=axis, keepdims=True))
out = exp_x / sum_exp
def _backward():
if x.requires_grad and out.grad is not None:
# Softmax gradient: softmax * (grad - (softmax * grad).sum())
s = out.data
grad_sum = np.sum(out.grad * s, axis=axis, keepdims=True)
grad = s * (out.grad - grad_sum)
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class Tanh(Layer):
[docs]
def forward(self, x):
out = Tensor(np.tanh(x.data), requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
# Derivative of tanh is 1 - tanh^2
grad = out.grad * (1 - out.data * out.data)
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class Sigmoid(Layer):
[docs]
def forward(self, x):
# Numerically stable sigmoid
out_data = np.where(
x.data >= 0,
1 / (1 + np.exp(-x.data)),
np.exp(x.data) / (1 + np.exp(x.data)),
)
out = Tensor(out_data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
# Derivative of sigmoid is sigmoid * (1 - sigmoid)
grad = out.grad * out.data * (1 - out.data)
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class LeakyReLU(Layer):
[docs]
def __init__(self, negative_slope=0.01):
super().__init__()
self.negative_slope = negative_slope
[docs]
def forward(self, x):
out_data = np.where(x.data > 0, x.data, self.negative_slope * x.data)
out = Tensor(out_data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
grad = np.where(x.data > 0, 1.0, self.negative_slope) * out.grad
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class ELU(Layer):
[docs]
def __init__(self, alpha=1.0):
super().__init__()
self.alpha = alpha
[docs]
def forward(self, x):
out_data = np.where(x.data > 0, x.data, self.alpha * (np.exp(x.data) - 1))
out = Tensor(out_data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
grad = np.where(x.data > 0, 1.0, out.data + self.alpha) * out.grad
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class GELU(Layer):
[docs]
def forward(self, x):
# Gaussian Error Linear Unit: x * 0.5 * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
sqrt_2_over_pi = np.sqrt(2.0 / np.pi)
cdf = 0.5 * (1.0 + np.tanh(sqrt_2_over_pi * (x.data + 0.044715 * x.data**3)))
out_data = x.data * cdf
out = Tensor(out_data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
# Approximate GELU derivative
tanh_arg = sqrt_2_over_pi * (x.data + 0.044715 * x.data**3)
tanh_val = np.tanh(tanh_arg)
sech2 = 1 - tanh_val**2
grad = 0.5 * (1 + tanh_val) + x.data * 0.5 * sech2 * sqrt_2_over_pi * (
1 + 3 * 0.044715 * x.data**2
)
grad = grad * out.grad
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class Swish(Layer):
[docs]
def forward(self, x):
# Swish: x * sigmoid(x)
sigmoid_data = 1 / (1 + np.exp(-np.clip(x.data, -88, 88)))
out_data = x.data * sigmoid_data
out = Tensor(out_data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
# Derivative: sigmoid + x * sigmoid * (1 - sigmoid)
sigmoid_val = 1 / (1 + np.exp(-np.clip(x.data, -88, 88)))
grad = sigmoid_val + x.data * sigmoid_val * (1 - sigmoid_val)
grad = grad * out.grad
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
class Dropout(Layer):
[docs]
def __init__(self, p=0.5):
super().__init__()
self.p = p
self.training = True
[docs]
def forward(self, x):
if not self.training or self.p == 0:
return x
# Create dropout mask
mask = np.random.binomial(1, 1 - self.p, x.data.shape) / (1 - self.p)
out_data = x.data * mask
out = Tensor(out_data, requires_grad=x.requires_grad)
def _backward():
if x.requires_grad and out.grad is not None:
grad = out.grad * mask
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out
[docs]
def train(self):
self.training = True
[docs]
def eval(self):
self.training = False
[docs]
class LogSoftmax(Layer):
[docs]
def forward(self, x: Tensor, axis=-1):
"""
Apply log-softmax function along specified axis.
Args:
x: Input tensor
axis: Axis along which to apply log-softmax
Returns:
Log-softmax output
"""
# Numerically stable log-softmax: x - logsumexp(x)
x_max = Tensor(np.max(x.data, axis=axis, keepdims=True))
x_shifted = x - x_max
exp_x = x_shifted.exp()
sum_exp = Tensor(np.sum(exp_x.data, axis=axis, keepdims=True))
log_sum_exp = sum_exp.log() + x_max
out = x - log_sum_exp
def _backward():
if x.requires_grad and out.grad is not None:
# Log-softmax gradient: grad - softmax * grad.sum()
softmax = exp_x / sum_exp
grad_sum = np.sum(out.grad, axis=axis, keepdims=True)
grad = out.grad - softmax.data * grad_sum
x.grad = grad if x.grad is None else x.grad + grad
out._backward = _backward
out._prev = {x}
return out