Numerical Transformers
Why Transform Numerical Features?
Numerical features often need transformation to:
- Normalize the scale of different features
- Handle skewed distributions
- Create more useful representations
- Meet assumptions of certain algorithms
Standardization
Standardization (z-score normalization) rescales each feature so that it has a mean of 0 and a standard deviation of 1.
StandardScaler
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
# Sample data
data = {
'height': [165, 180, 175, 160, 185], # in cm
'weight': [60, 85, 75, 55, 90] # in kg
}
df = pd.DataFrame(data)
# Create and apply scaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
# Convert to DataFrame for better display
scaled_df = pd.DataFrame(
scaled_data,
columns=df.columns
)
print("Original data:")
print(df)
print("
Scaled data:")
print(scaled_df)
print(f"
Mean: {scaled_df.mean()}")
print(f"Std: {scaled_df.std()}")Formula:
z = (x - μ) / σWhere:
- z is the standardized value
- x is the original value
- μ is the mean of the feature
- σ is the standard deviation of the feature
When to use: For algorithms sensitive to feature scales, like SVM, KNN, and neural networks.
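In a modelling workflow, fit the scaler on the training split only and reuse its statistics on the test split, so that test data does not leak into the transformation. A minimal sketch with synthetic, illustrative data:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
# Illustrative data: two features on very different scales
np.random.seed(42)
X = np.column_stack([np.random.normal(170, 10, 100),   # height-like values
                     np.random.normal(70, 15, 100)])    # weight-like values
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn mean and std from the training split only
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics on the test split
print(X_train_scaled.mean(axis=0))  # approximately 0
print(X_test_scaled.mean(axis=0))   # not exactly 0, because training statistics were reused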
RobustScaler
RobustScaler uses statistics that are robust to outliers (median and interquartile range).
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd
# Sample data with outliers
data = {
'salary': [50000, 55000, 60000, 65000, 500000], # last value is an outlier
'age': [25, 30, 35, 40, 90] # last value is an outlier
}
df = pd.DataFrame(data)
# Create and apply scaler
robust_scaler = RobustScaler()
robust_scaled = robust_scaler.fit_transform(df)
# Convert to DataFrame for better display
robust_df = pd.DataFrame(
robust_scaled,
columns=df.columns
)
print("Original data:")
print(df)
print("
Robust scaled data:")
print(robust_df)Formula:
z = (x - median) / IQRWhere:
- IQR is the interquartile range (75th percentile - 25th percentile)
When to use: When your data contains outliers that would skew standard scaling.
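To see why this matters, compare how the same outlier-heavy column comes out of both scalers. A small illustrative sketch reusing the salary values from above:
from sklearn.preprocessing import StandardScaler, RobustScaler
import numpy as np
salary = np.array([50000, 55000, 60000, 65000, 500000]).reshape(-1, 1)
standard = StandardScaler().fit_transform(salary)
robust = RobustScaler().fit_transform(salary)
# The outlier inflates the mean and standard deviation, squeezing the typical salaries together
print(standard.ravel())
# The median and IQR are barely affected, so the typical salaries keep their spread
print(robust.ravel())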
Normalization (Min-Max Scaling)
Scales features to a specific range, typically [0,1].
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Sample data
data = {
'score1': [10, 20, 30, 40, 50],
'score2': [100, 200, 300, 400, 500]
}
df = pd.DataFrame(data)
# Create and apply scaler
min_max_scaler = MinMaxScaler()
normalized = min_max_scaler.fit_transform(df)
# Convert to DataFrame for better display
normalized_df = pd.DataFrame(
normalized,
columns=df.columns
)
print("Original data:")
print(df)
print("
Normalized data:")
print(normalized_df)
print(f"
Min: {normalized_df.min()}")
print(f"Max: {normalized_df.max()}")Formula:
x_scaled = (x - min(x)) / (max(x) - min(x))When to use: When you need bounded values, like for neural networks with sigmoid activation or algorithms that require positive values.
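If you need a range other than [0, 1], MinMaxScaler takes a feature_range parameter. A short sketch scaling to [-1, 1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
X = np.array([[10], [20], [30], [40], [50]])
# Scale to [-1, 1] instead of the default [0, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
print(scaler.fit_transform(X).ravel())  # [-1.  -0.5  0.   0.5  1. ]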
Discretization
Discretization transforms continuous features into discrete ones by creating bins.
KBinsDiscretizer
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Generate sample data
np.random.seed(42)
data = np.random.normal(loc=50, scale=15, size=1000).reshape(-1, 1)
# Create discretizer with different strategies
n_bins = 5
discretizers = {
'uniform': KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform'),
'quantile': KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile'),
'kmeans': KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans')
}
# Apply discretization
results = {}
for name, discretizer in discretizers.items():
    results[name] = discretizer.fit_transform(data)
# Print bin edges for each strategy
for name, discretizer in discretizers.items():
    print(f"{name} bin edges: {discretizer.bin_edges_[0]}")
# Create one-hot encoded version
onehot_discretizer = KBinsDiscretizer(n_bins=n_bins, encode='onehot', strategy='quantile')
onehot_result = onehot_discretizer.fit_transform(data)
print(f"
One-hot encoded shape: {onehot_result.shape}")Discretization Strategies:
- uniform: Equal-width bins
- quantile: Equal-frequency bins
- kmeans: Bins based on K-means clustering
Encoding Options:
- ordinal: Integer bin indices
- onehot: One-hot encoding returned as a sparse matrix
- onehot-dense: One-hot encoding returned as a dense array
When to use:
- When the relationship between features and target is non-linear
- To convert continuous features into categorical ones
- To handle outliers by grouping extreme values
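A quick way to see how the strategies differ is to count how many samples fall into each bin. An illustrative sketch using the same kind of normally distributed data as above:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
np.random.seed(42)
X = np.random.normal(loc=50, scale=15, size=1000).reshape(-1, 1)
for strategy in ['uniform', 'quantile']:
    binned = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy=strategy).fit_transform(X)
    counts = np.bincount(binned.ravel().astype(int), minlength=5)
    # 'uniform' gives uneven counts (most data sits near the mean); 'quantile' gives roughly 200 per bin
    print(strategy, counts)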
Polynomial Features
Adds polynomial and interaction features to capture non-linear relationships.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
# Sample data
data = {
'length': [10, 20, 30, 40],
'width': [5, 10, 15, 20]
}
df = pd.DataFrame(data)
# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=True)
poly_features = poly.fit_transform(df)
# Get feature names
feature_names = poly.get_feature_names_out(df.columns)
# Convert to DataFrame for better display
poly_df = pd.DataFrame(
poly_features,
columns=feature_names
)
print("Original data:")
print(df)
print("
Polynomial features:")
print(poly_df)What it creates:
- A bias column of ones (since include_bias=True)
- The original features
- Squared terms (x²)
- Interaction terms (x₁x₂)
- Higher-order terms (x³, x₁²x₂, etc.) for degree > 2
When to use:
- When relationships between features and target are non-linear
- For algorithms like linear regression to model non-linear relationships
- When feature interactions might be important
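If you only want the interaction terms and not the squared terms, PolynomialFeatures accepts an interaction_only option. A brief sketch:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
X = np.array([[10, 5], [20, 10], [30, 15]])
# Keep length, width and length*width, but drop length^2 and width^2
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
print(poly.fit_transform(X))
print(poly.get_feature_names_out(['length', 'width']))  # ['length' 'width' 'length width']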
Log Transformation
Useful for handling skewed data and making relationships more linear.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sample skewed data
np.random.seed(42)
data = {
'income': np.random.exponential(scale=50000, size=1000),
'price': np.random.lognormal(mean=10, sigma=1, size=1000)
}
df = pd.DataFrame(data)
# Apply log transformation
df['log_income'] = np.log1p(df['income']) # log(1+x) to handle zeros
df['log_price'] = np.log1p(df['price'])
# Compare statistics
print("Original data statistics:")
print(df[['income', 'price']].describe())
print("
Log-transformed data statistics:")
print(df[['log_income', 'log_price']].describe())When to use:
- For heavily skewed distributions
- To make relationships more linear
- For features with multiplicative effects
- When data spans multiple orders of magnitude
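If the log transform needs to live inside a scikit-learn pipeline, FunctionTransformer can wrap np.log1p, with np.expm1 as its inverse. A minimal sketch:
from sklearn.preprocessing import FunctionTransformer
import numpy as np
log_transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1)
X = np.array([[10.0], [100.0], [1000.0], [10000.0]])
X_log = log_transformer.fit_transform(X)            # log(1 + x)
X_back = log_transformer.inverse_transform(X_log)   # recovers the original values
print(X_log.ravel())
print(X_back.ravel())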
Power Transformers
Power transformers apply a parametric, monotonic transformation to make data more Gaussian-like.
from sklearn.preprocessing import PowerTransformer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Generate skewed data
np.random.seed(42)
data = {
'skewed': np.random.exponential(scale=2, size=1000)  # strictly positive, right-skewed
}
df = pd.DataFrame(data)
# Apply power transformers
yj = PowerTransformer(method='yeo-johnson')
bc = PowerTransformer(method='box-cox') # only works with positive data
# Transform data
df['yeo_johnson'] = yj.fit_transform(df[['skewed']])
df['box_cox'] = bc.fit_transform(df[['skewed']])
# Print lambdas (transformation parameters)
print(f"Yeo-Johnson lambda: {yj.lambdas_}")
print(f"Box-Cox lambda: {bc.lambdas_}")Available Methods:
- Yeo-Johnson: Works with both positive and negative values
- Box-Cox: Only works with strictly positive values
When to use:
- When you need data to be normally distributed
- For algorithms that assume normality
- When you want to stabilize variance
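One way to check whether the transform helped is to measure skewness before and after. A small sketch using scipy.stats.skew on the same kind of exponential data:
from sklearn.preprocessing import PowerTransformer
from scipy.stats import skew
import numpy as np
np.random.seed(42)
X = np.random.exponential(scale=2, size=1000).reshape(-1, 1)
transformed = PowerTransformer(method='yeo-johnson').fit_transform(X)
print(f"Skewness before: {skew(X.ravel()):.2f}")           # strongly right-skewed
print(f"Skewness after:  {skew(transformed.ravel()):.2f}")  # close to 0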
Custom Transformers
You can create custom transformers by subclassing BaseEstimator and TransformerMixin.
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
class LogTransformer(BaseEstimator, TransformerMixin):
    """Custom transformer that applies log(1+x) to selected columns."""

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        # Nothing to learn; this transformer is stateless
        return self

    def transform(self, X):
        X_copy = X.copy()
        cols = self.columns if self.columns is not None else X_copy.columns
        for col in cols:
            if col in X_copy.columns:
                X_copy[col] = np.log1p(X_copy[col])
        return X_copy
# Sample data
data = {
'normal': [1, 2, 3, 4, 5],
'skewed': [10, 100, 1000, 10000, 100000]
}
df = pd.DataFrame(data)
# Apply custom transformer
log_transformer = LogTransformer(columns=['skewed'])
transformed_df = log_transformer.transform(df)
print("Original data:")
print(df)
print("
Transformed data:")
print(transformed_df)When to use:
- When you need custom transformation logic
- To combine multiple transformations
- When built-in transformers don't meet your needs
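Because it follows the fit/transform interface, the custom transformer can be chained with built-in steps. A short sketch that assumes the LogTransformer class defined above is in scope:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
df = pd.DataFrame({'normal': [1, 2, 3, 4, 5],
                   'skewed': [10, 100, 1000, 10000, 100000]})
pipeline = Pipeline([
    ('log', LogTransformer(columns=['skewed'])),  # custom transformer from above
    ('scale', StandardScaler())
])
print(pipeline.fit_transform(df))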
Combining Transformers in Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
# Sample data
data = {
'length': [10, 20, 30, 40],
'width': [5, 10, 15, 20],
'height': [2, 4, 6, 8]
}
df = pd.DataFrame(data)
# Create pipeline for numerical features
numeric_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=2, include_bias=False)),
('scaler', StandardScaler())
])
# Apply pipeline
transformed_data = numeric_pipeline.fit_transform(df)
# Get feature names from the pipeline's fitted polynomial step
feature_names = numeric_pipeline.named_steps['poly'].get_feature_names_out(df.columns)
# Create DataFrame for display
transformed_df = pd.DataFrame(
transformed_data,
columns=feature_names
)
print("Original data:")
print(df)
print("
Transformed data (first few columns):")
print(transformed_df.iloc[:, :5])
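ColumnTransformer (imported above but not yet used) applies different transformers to different columns. A minimal sketch that scales two columns and discretizes the third:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
import pandas as pd
df = pd.DataFrame({'length': [10, 20, 30, 40],
                   'width': [5, 10, 15, 20],
                   'height': [2, 4, 6, 8]})
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), ['length', 'width']),
    ('bin', KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform'), ['height'])
])
transformed = preprocessor.fit_transform(df)
print(preprocessor.get_feature_names_out())
print(transformed)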