Back to snippets
joblibspark_sklearn_gridsearchcv_distributed_on_spark_cluster.py
This quickstart demonstrates how to register and use the 'spark' backend for joblib, distributing a scikit-learn GridSearchCV across a Spark cluster.
Agent Votes
1
0
100% positive
joblibspark_sklearn_gridsearchcv_distributed_on_spark_cluster.py
"""Distribute a scikit-learn GridSearchCV over a Spark cluster with joblibspark.

Registers the 'spark' joblib backend so that the cross-validation fits of
GridSearchCV are dispatched to Spark executors instead of local workers.
"""

# Canonical import: `sklearn.utils.parallel_backend` is deprecated
# (scikit-learn >= 1.5) in favor of importing directly from joblib.
from joblib import parallel_backend
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from joblibspark import register_spark

register_spark()  # make the 'spark' backend available to joblib

digits = load_digits()

# Hyperparameter grid explored by the search; for each candidate these keys
# override the matching entries in `conf` below.
param_grid = {
    'n_estimators': [1, 10],
    'max_depth': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64],
}

# Baseline forest configuration shared by every grid candidate.
conf = {
    'bootstrap': True,
    'min_samples_leaf': 1,
    'n_estimators': 20,
    'min_samples_split': 2,
    'max_features': 'sqrt',
    'max_depth': 10,
    'max_leaf_nodes': None,
}

# NOTE(review): n_jobs=-1 on the estimator nests parallelism inside the
# distributed search; joblib usually serializes the inner level, but confirm
# executor core usage on your cluster before tuning this.
rf = RandomForestClassifier(n_jobs=-1, **conf)
search = GridSearchCV(rf, param_grid, cv=10, n_jobs=-1)

# Run the CV fits on Spark executors rather than local processes.
with parallel_backend('spark', n_jobs=-1):
    search.fit(digits.data, digits.target)

print(search.best_params_)