Source code for fit.data.preprocessing

"""
Data preprocessing utilities for machine learning.

This module provides tools for encoding categorical variables,
scaling features, and other preprocessing tasks.
"""

import numpy as np
from typing import Dict, List, Optional, Union, Any
import warnings



[docs]
class LabelEncoder:
    """
    Encode target labels with value between 0 and n_classes-1.

    Examples:
        >>> encoder = LabelEncoder()
        >>> labels = ['cat', 'dog', 'cat', 'bird']
        >>> encoded = encoder.fit_transform(labels)
        >>> print(encoded)  # [0, 1, 0, 2]
        >>> decoded = encoder.inverse_transform(encoded)
        >>> print(decoded)  # ['cat', 'dog', 'cat', 'bird']
    """


[docs]
    def __init__(self):
        self.classes_ = None
        self.class_to_index = None



[docs]
    def fit(self, y: Union[List, np.ndarray]) -> "LabelEncoder":
        """
        Fit label encoder.

        Args:
            y: Target values

        Returns:
            Self for method chaining
        """
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.class_to_index = {cls: idx for idx, cls in enumerate(self.classes_)}
        return self



[docs]
    def transform(self, y: Union[List, np.ndarray]) -> np.ndarray:
        """
        Transform labels to normalized encoding.

        Args:
            y: Target values

        Returns:
            Encoded labels
        """
        if self.classes_ is None:
            raise ValueError("This LabelEncoder instance is not fitted yet.")

        y = np.asarray(y)
        encoded = np.zeros(len(y), dtype=int)

        for i, label in enumerate(y):
            if label not in self.class_to_index:
                raise ValueError(f"Label '{label}' not seen during fit.")
            encoded[i] = self.class_to_index[label]

        return encoded



[docs]
    def fit_transform(self, y: Union[List, np.ndarray]) -> np.ndarray:
        """
        Fit label encoder and return encoded labels.

        Args:
            y: Target values

        Returns:
            Encoded labels
        """
        return self.fit(y).transform(y)



[docs]
    def inverse_transform(self, y: np.ndarray) -> np.ndarray:
        """
        Transform labels back to original encoding.

        Args:
            y: Encoded target values

        Returns:
            Original labels
        """
        if self.classes_ is None:
            raise ValueError("This LabelEncoder instance is not fitted yet.")

        y = np.asarray(y)
        return self.classes_[y]





[docs]
class OneHotEncoder:
    """
    Encode categorical features as a one-hot numeric array.

    Examples:
        >>> encoder = OneHotEncoder()
        >>> data = [['cat'], ['dog'], ['cat'], ['bird']]
        >>> encoded = encoder.fit_transform(data)
        >>> print(encoded.shape)  # (4, 3)
    """


[docs]
    def __init__(self, sparse: bool = False, drop: Optional[str] = None):
        """
        Initialize OneHotEncoder.

        Args:
            sparse: Return sparse matrix if True (not implemented yet)
            drop: Strategy to use to drop one category per feature (not implemented yet)
        """
        self.sparse = sparse
        self.drop = drop
        self.categories_ = None
        self.feature_names_in_ = None

        if sparse:
            warnings.warn("Sparse output not implemented yet, will return dense array")



[docs]
    def fit(self, X: Union[List, np.ndarray]) -> "OneHotEncoder":
        """
        Fit OneHotEncoder to X.

        Args:
            X: Input samples

        Returns:
            Self for method chaining
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        self.categories_ = []

        for col_idx in range(X.shape[1]):
            unique_vals = np.unique(X[:, col_idx])
            self.categories_.append(unique_vals)

        return self



[docs]
    def transform(self, X: Union[List, np.ndarray]) -> np.ndarray:
        """
        Transform X using one-hot encoding.

        Args:
            X: Input samples

        Returns:
            One-hot encoded array
        """
        if self.categories_ is None:
            raise ValueError("This OneHotEncoder instance is not fitted yet.")

        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        encoded_features = []

        for col_idx in range(X.shape[1]):
            categories = self.categories_[col_idx]
            col_data = X[:, col_idx]

            # Create one-hot for this column
            col_encoded = np.zeros((len(col_data), len(categories)))

            for i, value in enumerate(col_data):
                if value in categories:
                    cat_idx = np.where(categories == value)[0][0]
                    col_encoded[i, cat_idx] = 1
                else:
                    raise ValueError(f"Value '{value}' not seen during fit.")

            encoded_features.append(col_encoded)

        return np.hstack(encoded_features)



[docs]
    def fit_transform(self, X: Union[List, np.ndarray]) -> np.ndarray:
        """
        Fit OneHotEncoder and transform X.

        Args:
            X: Input samples

        Returns:
            One-hot encoded array
        """
        return self.fit(X).transform(X)



[docs]
    def get_feature_names_out(
        self, input_features: Optional[List[str]] = None
    ) -> List[str]:
        """
        Get output feature names for transformation.

        Args:
            input_features: Input feature names

        Returns:
            Output feature names
        """
        if self.categories_ is None:
            raise ValueError("This OneHotEncoder instance is not fitted yet.")

        if input_features is None:
            input_features = [f"x{i}" for i in range(len(self.categories_))]

        feature_names = []
        for feature_idx, categories in enumerate(self.categories_):
            feature_name = input_features[feature_idx]
            for category in categories:
                feature_names.append(f"{feature_name}_{category}")

        return feature_names





[docs]
class StandardScaler:
    """
    Standardize features by removing the mean and scaling to unit variance.

    Examples:
        >>> scaler = StandardScaler()
        >>> X = [[1, 2], [3, 4], [5, 6]]
        >>> X_scaled = scaler.fit_transform(X)
    """


[docs]
    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """
        Initialize StandardScaler.

        Args:
            with_mean: Center the data before scaling
            with_std: Scale the data to unit variance
        """
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.scale_ = None



[docs]
    def fit(self, X: np.ndarray) -> "StandardScaler":
        """
        Compute the mean and std to be used for later scaling.

        Args:
            X: Training data

        Returns:
            Self for method chaining
        """
        X = np.asarray(X, dtype=np.float64)

        if self.with_mean:
            self.mean_ = np.mean(X, axis=0)
        else:
            self.mean_ = np.zeros(X.shape[1])

        if self.with_std:
            self.scale_ = np.std(X, axis=0)
            # Avoid division by zero
            self.scale_[self.scale_ == 0] = 1.0
        else:
            self.scale_ = np.ones(X.shape[1])

        return self



[docs]
    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Perform standardization by centering and scaling.

        Args:
            X: Data to transform

        Returns:
            Transformed data
        """
        if self.mean_ is None or self.scale_ is None:
            raise ValueError("This StandardScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return (X - self.mean_) / self.scale_



[docs]
    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Fit to data, then transform it.

        Args:
            X: Training data

        Returns:
            Transformed data
        """
        return self.fit(X).transform(X)



[docs]
    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Scale back the data to the original representation.

        Args:
            X: Transformed data

        Returns:
            Original scale data
        """
        if self.mean_ is None or self.scale_ is None:
            raise ValueError("This StandardScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return X * self.scale_ + self.mean_





[docs]
class MinMaxScaler:
    """
    Transform features by scaling each feature to a given range.

    Examples:
        >>> scaler = MinMaxScaler()
        >>> X = [[1, 2], [3, 4], [5, 6]]
        >>> X_scaled = scaler.fit_transform(X)  # Scale to [0, 1]
    """


[docs]
    def __init__(self, feature_range: tuple = (0, 1)):
        """
        Initialize MinMaxScaler.

        Args:
            feature_range: Desired range of transformed data
        """
        self.feature_range = feature_range
        self.min_ = None
        self.scale_ = None
        self.data_min_ = None
        self.data_max_ = None



[docs]
    def fit(self, X: np.ndarray) -> "MinMaxScaler":
        """
        Compute the minimum and maximum to be used for later scaling.

        Args:
            X: Training data

        Returns:
            Self for method chaining
        """
        X = np.asarray(X, dtype=np.float64)

        self.data_min_ = np.min(X, axis=0)
        self.data_max_ = np.max(X, axis=0)

        data_range = self.data_max_ - self.data_min_
        # Avoid division by zero
        data_range[data_range == 0] = 1.0

        feature_min, feature_max = self.feature_range
        self.scale_ = (feature_max - feature_min) / data_range
        self.min_ = feature_min - self.data_min_ * self.scale_

        return self



[docs]
    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Scale features according to feature_range.

        Args:
            X: Data to transform

        Returns:
            Transformed data
        """
        if self.scale_ is None or self.min_ is None:
            raise ValueError("This MinMaxScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return X * self.scale_ + self.min_



[docs]
    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Fit to data, then transform it.

        Args:
            X: Training data

        Returns:
            Transformed data
        """
        return self.fit(X).transform(X)



[docs]
    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Undo the scaling of X according to feature_range.

        Args:
            X: Transformed data

        Returns:
            Original scale data
        """
        if self.scale_ is None or self.min_ is None:
            raise ValueError("This MinMaxScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return (X - self.min_) / self.scale_