
GridSearchCV

Below is a simple usage example.

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC


X, y = datasets.load_digits(return_X_y=True)  # X already has shape (n_samples, 64)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=87)

# Set up the search parameter grids
tuned_parameters = [
    {'kernel': ['rbf'], 
     'gamma': [1e-3, 1e-4, 'scale'],
     'C': [1, 10, 100, 1000],
     },
    {'kernel': ['linear'], 
     'C': [1, 10, 100, 1000],
     },
    {'kernel': ['poly'],
     'gamma': ['scale', 'auto'],
     'degree': [2, 3, 4, 5, 6],
     'C': [1, 10, 100, 1000],
     },
]

scores = ['precision_macro', 'recall_micro', 'f1_weighted', 
          'accuracy', 'balanced_accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    classifier = GridSearchCV(
        SVC(), tuned_parameters, scoring=score
    ).fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")
    
    means = classifier.cv_results_['mean_test_score']
    stds = classifier.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, classifier.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
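
A detail worth knowing: with the default refit=True, GridSearchCV refits the best parameter combination on the whole training set, so classifier.predict above already uses that refit model. After a fit you can inspect the winner directly:

print(classifier.best_score_)      # best mean cross-validated score
print(classifier.best_estimator_)  # the SVC refit on the full training set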
    

You can query which scorers are available:

from sklearn import metrics

metrics.get_scorer_names()
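
If none of the built-in names fit, any metric function can be wrapped into a scorer with make_scorer and passed as the scoring argument. A minimal sketch (the choice of fbeta_score with beta=2 is just for illustration):

from sklearn.metrics import make_scorer, fbeta_score

# Wrap fbeta_score into a scorer object; extra keyword arguments are forwarded to the metric
f2_scorer = make_scorer(fbeta_score, beta=2, average='macro')

# It can then be used anywhere a scoring string is accepted:
# GridSearchCV(SVC(), tuned_parameters, scoring=f2_scorer)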

XGBoost with GridSearchCV

For more parameters, see the official documentation.

Below is an example combining XGBoost, Pipeline, and GridSearchCV; adapt it to your own needs.

from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=87)

# Set up the search parameter grid
tuned_parameters = [
    {'classifier__n_estimators': [100, 200, 300], 
     'classifier__max_depth': [1, 2, 3],
     'classifier__min_child_weight': [1, 2, 3],
     },
]

# Set up the Pipeline
estimators = [
    ('scaler', StandardScaler()),   # scale before PCA, which is sensitive to feature variance
    ('reduce_dim', PCA()),
    ('classifier', XGBClassifier())
]
pipeline = Pipeline(estimators)


scores = ['accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    classifier = GridSearchCV(
        pipeline,
        tuned_parameters,
        scoring=score,
        cv=3
    ).fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")
    
    means = classifier.cv_results_['mean_test_score']
    stds = classifier.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, classifier.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
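
Note the classifier__ prefix in tuned_parameters: inside a Pipeline, GridSearchCV addresses each step's hyperparameters as <step name>__<parameter name>. If you are unsure which names are available, you can list them:

# Every tunable name in the pipeline, e.g. 'reduce_dim__n_components',
# 'classifier__n_estimators', ...
print(sorted(pipeline.get_params().keys()))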


LightGBM with GridSearchCV

For more parameters, see the official documentation.

Again, here is a usage example.

from lightgbm import LGBMClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=87)

# Set up the search parameter grid
tuned_parameters = [
    {'classifier__n_estimators': [100, 200, 300], 
     'classifier__max_depth': [1, 2, 3],
     'classifier__min_child_weight': [1, 2, 3],
     },
]

# Set up the Pipeline
estimators = [
    ('scaler', StandardScaler()),   # scale before PCA, which is sensitive to feature variance
    ('reduce_dim', PCA()),
    ('classifier', LGBMClassifier())
]
pipeline = Pipeline(estimators)


scores = ['accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    classifier = GridSearchCV(
        pipeline,
        tuned_parameters,
        scoring=score,
        cv=3
    ).fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")
    
    means = classifier.cv_results_['mean_test_score']
    stds = classifier.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, classifier.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
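
Since every parameter combination and every fold is an independent fit, the search parallelizes well. Both GridSearchCV and RandomizedSearchCV accept n_jobs, for example:

classifier = GridSearchCV(
    pipeline,
    tuned_parameters,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,  # use all available CPU cores for the search
).fit(X_train, y_train)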
    

RandomizedSearchCV

Often we do not know in advance which parameter values will work well. With RandomizedSearchCV, the search space can be a distribution rather than a fixed list of candidates.
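
For example, scipy's loguniform spreads samples evenly across orders of magnitude, which usually suits a parameter like C better than a plain uniform distribution:

from scipy.stats import loguniform

# Five samples spread evenly across orders of magnitude in [1, 1000]
print(loguniform(1e0, 1e3).rvs(5, random_state=0))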

Below is a full usage example.

from sklearn import datasets
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import numpy as np
from scipy.stats import uniform, loguniform


X, y = datasets.load_digits(return_X_y=True)  # X already has shape (n_samples, 64)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=87)

# Set up the search space: lists are sampled uniformly, distributions via their rvs method
tuned_parameters = [
    {'kernel': ['rbf'], 
     'gamma': [1e-3, 1e-4, 'scale'],
     'C': uniform(loc=1, scale=999),                  # uniform distribution over [1, 1000]
     },
    {'kernel': ['linear'], 
     'C': np.logspace(0, 3, base=10, num=100),        # 100 log-spaced candidates from 10^0 to 10^3
     },
    {'kernel': ['poly'],
     'gamma': ['scale', 'auto'],
     'degree': [2, 3, 4, 5, 6],
     'C': loguniform(1e0, 1e3),                       # log-uniform distribution over [1, 1000]
     },
]

scores = ['precision_macro', 'recall_micro', 'f1_weighted', 
          'accuracy', 'balanced_accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    classifier = RandomizedSearchCV(
        SVC(), tuned_parameters, scoring=score
    ).fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")
    
    means = classifier.cv_results_['mean_test_score']
    stds = classifier.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, classifier.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
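
Unlike GridSearchCV, RandomizedSearchCV does not enumerate every combination: it draws n_iter candidate settings (default 10) from the lists and distributions above. For a more thorough search, raise n_iter and fix the seed for reproducibility:

classifier = RandomizedSearchCV(
    SVC(),
    tuned_parameters,
    scoring=score,
    n_iter=50,        # number of sampled parameter settings (default is 10)
    random_state=87,  # make the sampling reproducible
).fit(X_train, y_train)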