Source code for fit.data.preprocessing

"""
Data preprocessing utilities for machine learning.

This module provides tools for encoding categorical variables,
scaling features, and other preprocessing tasks.
"""

import numpy as np
from typing import Dict, List, Optional, Union, Any
import warnings


class LabelEncoder:
    """
    Encode target labels as integers between 0 and n_classes - 1.

    The distinct labels are ordered as returned by ``np.unique`` (sorted),
    so the code assigned to a label is its rank among the distinct labels,
    not its order of first appearance.

    Attributes:
        classes_: Sorted array of the distinct labels seen by ``fit``,
            or ``None`` before fitting.
        class_to_index: Mapping from each label to its integer code,
            or ``None`` before fitting.

    Examples:
        >>> encoder = LabelEncoder()
        >>> labels = ['cat', 'dog', 'cat', 'bird']
        >>> encoded = encoder.fit_transform(labels)
        >>> print(encoded)  # [1 2 1 0]
        >>> decoded = encoder.inverse_transform(encoded)
        >>> print(decoded)  # ['cat' 'dog' 'cat' 'bird']
    """

    def __init__(self):
        """Create an unfitted encoder."""
        self.classes_: Optional[np.ndarray] = None
        self.class_to_index: Optional[Dict[Any, int]] = None

    def fit(self, y: Union[List, np.ndarray]) -> "LabelEncoder":
        """
        Fit label encoder.

        Args:
            y: Target values

        Returns:
            Self for method chaining
        """
        self.classes_ = np.unique(np.asarray(y))
        self.class_to_index = {
            label: code for code, label in enumerate(self.classes_)
        }
        return self

    def transform(self, y: Union[List, np.ndarray]) -> np.ndarray:
        """
        Transform labels to normalized encoding.

        Args:
            y: Target values

        Returns:
            Integer array with one code per label

        Raises:
            ValueError: If the encoder is not fitted, or a label was not
                seen during ``fit``.
        """
        if self.classes_ is None:
            raise ValueError("This LabelEncoder instance is not fitted yet.")

        labels = np.asarray(y)
        lookup = self.class_to_index
        encoded = np.zeros(len(labels), dtype=int)
        for position, label in enumerate(labels):
            try:
                encoded[position] = lookup[label]
            except KeyError:
                raise ValueError(
                    f"Label '{label}' not seen during fit."
                ) from None
        return encoded

    def fit_transform(self, y: Union[List, np.ndarray]) -> np.ndarray:
        """
        Fit label encoder and return encoded labels.

        Args:
            y: Target values

        Returns:
            Encoded labels
        """
        return self.fit(y).transform(y)

    def inverse_transform(self, y: np.ndarray) -> np.ndarray:
        """
        Transform integer codes back to the original labels.

        Args:
            y: Encoded target values

        Returns:
            Original labels

        Raises:
            ValueError: If the encoder is not fitted.
            IndexError: If a code is negative or not smaller than the
                number of classes.
        """
        if self.classes_ is None:
            raise ValueError("This LabelEncoder instance is not fitted yet.")

        codes = np.asarray(y)
        if codes.size == 0:
            # An empty list converts to a float64 array, which NumPy refuses
            # to use as an index; return an empty result of the label dtype.
            return np.empty(codes.shape, dtype=self.classes_.dtype)
        if np.issubdtype(codes.dtype, np.integer) and (codes < 0).any():
            # Negative indices would silently wrap around to the last
            # classes instead of failing like any other invalid code.
            raise IndexError(
                "y contains negative values; expected integers in "
                f"[0, {len(self.classes_) - 1}]."
            )
        return self.classes_[codes]
class OneHotEncoder:
    """
    Encode categorical features as a one-hot numeric array.

    Each input column is expanded into one output column per distinct
    category seen during ``fit``; categories are ordered as returned by
    ``np.unique`` (sorted). 1-D input is treated as a single column.

    Attributes:
        categories_: List with one sorted category array per input column,
            or ``None`` before fitting.

    Examples:
        >>> encoder = OneHotEncoder()
        >>> data = [['cat'], ['dog'], ['cat'], ['bird']]
        >>> encoded = encoder.fit_transform(data)
        >>> print(encoded.shape)  # (4, 3)
    """

    def __init__(self, sparse: bool = False, drop: Optional[str] = None):
        """
        Initialize OneHotEncoder.

        Args:
            sparse: Return sparse matrix if True (not implemented yet;
                a warning is issued and dense output is returned)
            drop: Strategy to use to drop one category per feature
                (not implemented yet; a warning is issued and all
                categories are kept)
        """
        self.sparse = sparse
        self.drop = drop
        self.categories_: Optional[List[np.ndarray]] = None
        self.feature_names_in_ = None

        if sparse:
            warnings.warn(
                "Sparse output not implemented yet, will return dense array",
                stacklevel=2,
            )
        if drop is not None:
            # Warn like `sparse` does instead of silently ignoring the option.
            warnings.warn(
                "Dropping categories not implemented yet, "
                "all categories will be kept",
                stacklevel=2,
            )

    @staticmethod
    def _as_columns(X: Union[List, np.ndarray]) -> np.ndarray:
        """Convert X to an array, turning 1-D input into a single column."""
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X

    def fit(self, X: Union[List, np.ndarray]) -> "OneHotEncoder":
        """
        Fit OneHotEncoder to X.

        Args:
            X: Input samples

        Returns:
            Self for method chaining
        """
        X = self._as_columns(X)
        self.categories_ = [
            np.unique(X[:, col_idx]) for col_idx in range(X.shape[1])
        ]
        return self

    def transform(self, X: Union[List, np.ndarray]) -> np.ndarray:
        """
        Transform X using one-hot encoding.

        Args:
            X: Input samples

        Returns:
            One-hot encoded array with the per-column encodings stacked
            horizontally in input column order

        Raises:
            ValueError: If the encoder is not fitted, X does not have the
                number of columns seen during ``fit``, or a value was not
                seen during ``fit``.
        """
        if self.categories_ is None:
            raise ValueError("This OneHotEncoder instance is not fitted yet.")

        X = self._as_columns(X)
        n_fitted = len(self.categories_)
        if X.shape[1] != n_fitted:
            # Fewer columns used to yield a silently truncated encoding and
            # more columns an opaque IndexError.
            raise ValueError(
                f"X has {X.shape[1]} features, but OneHotEncoder was "
                f"fitted with {n_fitted} features."
            )

        encoded_features = []
        for col_idx, categories in enumerate(self.categories_):
            col_data = X[:, col_idx]
            # Build the category -> output column map once per input column
            # rather than scanning the category array for every value.
            lookup = {
                category: cat_idx
                for cat_idx, category in enumerate(categories)
            }
            col_encoded = np.zeros((len(col_data), len(categories)))
            for row, value in enumerate(col_data):
                try:
                    col_encoded[row, lookup[value]] = 1
                except KeyError:
                    raise ValueError(
                        f"Value '{value}' not seen during fit."
                    ) from None
            encoded_features.append(col_encoded)

        return np.hstack(encoded_features)

    def fit_transform(self, X: Union[List, np.ndarray]) -> np.ndarray:
        """
        Fit OneHotEncoder and transform X.

        Args:
            X: Input samples

        Returns:
            One-hot encoded array
        """
        return self.fit(X).transform(X)

    def get_feature_names_out(
        self, input_features: Optional[List[str]] = None
    ) -> List[str]:
        """
        Get output feature names for transformation.

        Names have the form ``"<feature>_<category>"`` and follow the
        column order of ``transform``.

        Args:
            input_features: Input feature names; defaults to
                ``x0, x1, ...`` when omitted

        Returns:
            Output feature names

        Raises:
            ValueError: If the encoder is not fitted, or ``input_features``
                does not contain one name per fitted column.
        """
        if self.categories_ is None:
            raise ValueError("This OneHotEncoder instance is not fitted yet.")

        n_fitted = len(self.categories_)
        if input_features is None:
            input_features = [f"x{i}" for i in range(n_fitted)]
        elif len(input_features) != n_fitted:
            raise ValueError(
                f"input_features has {len(input_features)} names, but "
                f"OneHotEncoder was fitted with {n_fitted} features."
            )

        feature_names = []
        for feature_name, categories in zip(input_features, self.categories_):
            feature_names.extend(
                f"{feature_name}_{category}" for category in categories
            )
        return feature_names
class StandardScaler:
    """
    Standardize features by removing the mean and scaling to unit variance.

    Statistics are computed along axis 0 using the population standard
    deviation (``np.std`` default). Features with zero standard deviation
    are left unscaled (their scale is set to 1). 1-D input is treated as
    samples of a single feature.

    Attributes:
        mean_: Per-feature value subtracted by ``transform`` (zeros when
            ``with_mean`` is False), or ``None`` before fitting.
        scale_: Per-feature divisor used by ``transform`` (ones when
            ``with_std`` is False), or ``None`` before fitting.

    Examples:
        >>> scaler = StandardScaler()
        >>> X = [[1, 2], [3, 4], [5, 6]]
        >>> X_scaled = scaler.fit_transform(X)
    """

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """
        Initialize StandardScaler.

        Args:
            with_mean: Center the data before scaling
            with_std: Scale the data to unit variance
        """
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.scale_ = None

    def fit(self, X: np.ndarray) -> "StandardScaler":
        """
        Compute the mean and std to be used for later scaling.

        Args:
            X: Training data

        Returns:
            Self for method chaining

        Raises:
            ValueError: If X contains no samples.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.ndim == 0 or X.shape[0] == 0:
            # Statistics of an empty sample would be NaN and poison every
            # later transform, so fail early instead.
            raise ValueError(
                "StandardScaler requires at least one sample to fit."
            )

        # Shape of one sample; () for 1-D input, so that case works too.
        feature_shape = X.shape[1:]

        if self.with_mean:
            self.mean_ = np.mean(X, axis=0)
        else:
            self.mean_ = np.zeros(feature_shape)

        if self.with_std:
            std = np.std(X, axis=0)
            # Avoid division by zero for constant features. np.where also
            # handles the scalar produced by 1-D input, which in-place
            # masking cannot.
            self.scale_ = np.where(std == 0, 1.0, std)
        else:
            self.scale_ = np.ones(feature_shape)

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Perform standardization by centering and scaling.

        Args:
            X: Data to transform

        Returns:
            Transformed data

        Raises:
            ValueError: If the scaler is not fitted.
        """
        if self.mean_ is None or self.scale_ is None:
            raise ValueError("This StandardScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return (X - self.mean_) / self.scale_

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Fit to data, then transform it.

        Args:
            X: Training data

        Returns:
            Transformed data
        """
        return self.fit(X).transform(X)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Scale back the data to the original representation.

        Args:
            X: Transformed data

        Returns:
            Original scale data

        Raises:
            ValueError: If the scaler is not fitted.
        """
        if self.mean_ is None or self.scale_ is None:
            raise ValueError("This StandardScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return X * self.scale_ + self.mean_
class MinMaxScaler:
    """
    Transform features by scaling each feature to a given range.

    The training minimum of each feature maps to ``feature_range[0]`` and
    the training maximum to ``feature_range[1]``. Constant features map to
    ``feature_range[0]``. 1-D input is treated as samples of a single
    feature.

    Attributes:
        data_min_: Per-feature minimum seen by ``fit``.
        data_max_: Per-feature maximum seen by ``fit``.
        scale_: Per-feature multiplier applied by ``transform``.
        min_: Per-feature offset added by ``transform``.
        All four are ``None`` before fitting.

    Examples:
        >>> scaler = MinMaxScaler()
        >>> X = [[1, 2], [3, 4], [5, 6]]
        >>> X_scaled = scaler.fit_transform(X)  # Scale to [0, 1]
    """

    def __init__(self, feature_range: tuple = (0, 1)):
        """
        Initialize MinMaxScaler.

        Args:
            feature_range: Desired range of transformed data
        """
        self.feature_range = feature_range
        self.min_ = None
        self.scale_ = None
        self.data_min_ = None
        self.data_max_ = None

    def fit(self, X: np.ndarray) -> "MinMaxScaler":
        """
        Compute the minimum and maximum to be used for later scaling.

        Args:
            X: Training data

        Returns:
            Self for method chaining

        Raises:
            ValueError: If ``feature_range`` has zero width.
        """
        feature_min, feature_max = self.feature_range
        if feature_min == feature_max:
            # A zero-width range gives scale_ == 0, which makes
            # inverse_transform divide by zero.
            raise ValueError(
                "feature_range must have distinct minimum and maximum, "
                f"got {self.feature_range}."
            )

        X = np.asarray(X, dtype=np.float64)
        self.data_min_ = np.min(X, axis=0)
        self.data_max_ = np.max(X, axis=0)

        span = self.data_max_ - self.data_min_
        # Avoid division by zero for constant features. np.where also handles
        # the scalar produced by 1-D input, which in-place masking cannot.
        span = np.where(span == 0, 1.0, span)

        self.scale_ = (feature_max - feature_min) / span
        self.min_ = feature_min - self.data_min_ * self.scale_

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Scale features according to feature_range.

        Args:
            X: Data to transform

        Returns:
            Transformed data

        Raises:
            ValueError: If the scaler is not fitted.
        """
        if self.scale_ is None or self.min_ is None:
            raise ValueError("This MinMaxScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return X * self.scale_ + self.min_

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Fit to data, then transform it.

        Args:
            X: Training data

        Returns:
            Transformed data
        """
        return self.fit(X).transform(X)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Undo the scaling of X according to feature_range.

        Args:
            X: Transformed data

        Returns:
            Original scale data

        Raises:
            ValueError: If the scaler is not fitted.
        """
        if self.scale_ is None or self.min_ is None:
            raise ValueError("This MinMaxScaler instance is not fitted yet.")

        X = np.asarray(X, dtype=np.float64)
        return (X - self.min_) / self.scale_