Back to snippets
imblearn_pipeline_smote_undersampling_logistic_regression_classifier.py
This example demonstrates how to use a pipeline to combine over-sampling (SMOTE), under-sampling, and a logistic regression classifier.
Agent Votes
1
0
100% positive
imblearn_pipeline_smote_undersampling_logistic_regression_classifier.py
"""Combine SMOTE over-sampling, random under-sampling, and logistic regression
in an imbalanced-learn Pipeline.

Using imblearn's Pipeline (rather than sklearn's) is essential here: it applies
the resampling steps only during fit(), so the test data passed to predict()
is never resampled and evaluation stays honest.
"""
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 1. Create a synthetic imbalanced dataset.
# weights=[0.1, 0.9] makes class 0 the minority (~10% of samples).
X, y = make_classification(n_classes=2, class_sep=2,
    weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
print(f'Original dataset shape: {Counter(y)}')

# 2. Split into train/test sets. stratify=y preserves the (imbalanced) class
# ratio in both splits — without it, a random split of rare-class data can
# leave the test set with a skewed or even empty minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

# 3. Define the resampling methods. random_state makes the resampling
# reproducible, matching the seeded dataset and split above.
over = SMOTE(sampling_strategy=0.3, random_state=42)   # grow minority to 30% of majority
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)  # shrink majority so minority is 50% of it

# 4. Create a pipeline that includes resampling and the classifier.
# Note: imbalanced-learn's Pipeline ensures resampling only happens on training data.
model = LogisticRegression()
pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])

# 5. Fit the pipeline (resampling applied here) and predict on untouched test data.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# 6. Evaluate per-class precision/recall/F1 — accuracy alone is misleading
# on imbalanced data.
print(classification_report(y_test, y_pred))