Tuning ML-model hyperparameters with HYPEROPT

In machine learning, hyperparameters are model parameters whose values are set before the training process starts. They can be parameters of the algorithm itself (for example, the tree depth in a random forest, the number of neighbors in kNN, the number of neurons and layers in a neural network) as well as choices about how to process features and missing values. Hyperparameters control the learning process, so selecting good values is a very important step in building an ML model: it improves accuracy and helps fight overfitting. Today there are several popular approaches to this selection problem, for example:





1. Grid search. A grid of candidate values is fixed for every hyperparameter, and the model is evaluated on each combination exhaustively; in sklearn this is implemented as GridSearchCV. The drawback is cost and coarseness: the number of combinations multiplies with every hyperparameter added, and the search cannot see between the grid nodes. For example, if the optimal value of a parameter is 550 but the grid runs from 100 to 1000 with a step of 100 (a perfectly reasonable-looking grid), GridSearch will simply never try it.

2. Random search. Values are drawn at random from distributions specified for each hyperparameter; sklearn implements this as RandomizedSearchCV. Unlike GridSearch, the budget (the number of sampled points) is fixed in advance no matter how many hyperparameters there are, and the sampled values are not restricted to grid nodes. On the other hand, there is no guarantee the optimum will be found (a minimal sklearn sketch of these two approaches is given after this list).

3. Bayesian optimization. Unlike the previous two approaches, the next point to evaluate is chosen using the results of all previous evaluations, trading off exploration of new regions against exploitation of promising ones. This typically reaches good hyperparameters in far fewer evaluations.
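For concreteness, here is a minimal sketch of the first two approaches in sklearn. The toy dataset and the parameter ranges are made up purely for illustration:

import numpy as np
from scipy.stats import loguniform
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = make_classification(n_samples=500, random_state=1)

# grid search: every combination of the listed values is evaluated
grid = GridSearchCV(
    LogisticRegression(solver='liblinear'),
    param_grid={'penalty': ['l1', 'l2'],
                'C': [0.01, 0.1, 1, 10, 100]},
    scoring='roc_auc', cv=4)
grid.fit(X, y)

# random search: a fixed number of points sampled from distributions
rnd = RandomizedSearchCV(
    LogisticRegression(solver='liblinear'),
    param_distributions={'penalty': ['l1', 'l2'],
                         'C': loguniform(1e-4, 1e2)},
    n_iter=20, scoring='roc_auc', cv=4, random_state=1)
rnd.fit(X, y)

print(grid.best_params_, rnd.best_params_)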





Bayesian optimization is what hyperopt, a Python library for hyperparameter tuning, is built around. It implements three optimization algorithms: Random Search, Tree of Parzen Estimators (TPE), and Simulated Annealing; TPE is the Bayesian one and is used most often. Hyperopt also serves as the engine for higher-level wrappers such as hyperopt-sklearn and hyperas (for Keras).





hyperopt is installed in the usual way:





pip install hyperopt



As a demonstration we will use the well-known Adult dataset, where the task is to predict whether a person earns more than $50K a year. First, load the data and do some basic cleaning:





from functools import partial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

# load the data
df = pd.read_csv('adult.data.csv')

# drop duplicate rows
df.drop_duplicates(inplace=True, ignore_index=True)

# separate the features from the target
X = df.drop(labels=['salary', 'native-country'], axis=1).copy()
y = df['salary'].map({'<=50K': 0, '>50K': 1}).values



The dataset contains both numeric and categorical features, so each group needs its own preprocessing; sklearn's ColumnTransformer lets us combine everything in one step. Categorical columns (type object) go through SimpleImputer (missing values in this dataset are marked with "?"), which fills them with the most frequent value, and then through OneHotEncoder (converting them to dummy variables). Numeric columns (everything else) are standardized with StandardScaler. Finally, the transformer and a logistic regression are assembled into a single pipeline.





# categorical columns have type object,
# numeric columns are everything else
num_columns = np.where(X.dtypes != 'object')[0]
cat_columns = np.where(X.dtypes == 'object')[0]

# pipeline for the categorical features
# (note: in sklearn >= 1.2 the OneHotEncoder argument
#  is called sparse_output instead of sparse)
cat_pipe = Pipeline([('imputer', SimpleImputer(missing_values='?',
                                               strategy='most_frequent')),
                     ('ohe', OneHotEncoder(sparse=False,
                                           handle_unknown='ignore'))])

# pipeline for the numeric features
num_pipe = Pipeline([('scaler', StandardScaler())])

# combine the two pipelines
transformer = ColumnTransformer(
    transformers=[('cat', cat_pipe, cat_columns),
                  ('num', num_pipe, num_columns)],
    remainder='passthrough')

# the full model: preprocessing + logistic regression
model = Pipeline([('transformer', transformer),
                  ('lr', LogisticRegression(random_state=1, n_jobs=-1,
                                            solver='liblinear'))])



Next, define the hyperparameter search space for hyperopt:





search_space = {
    # regularization type: l1 or l2
    'lr__penalty': hp.choice(label='penalty',
                             options=['l1', 'l2']),
    # regularization strength, sampled log-uniformly
    'lr__C': hp.loguniform(label='C',
                           low=-4 * np.log(10),
                           high=2 * np.log(10))
}



The parameter C will be sampled log-uniformly from the interval [-4·ln 10, 2·ln 10], which after exponentiation means C lies in [10^-4, 10^2]; the penalty is chosen from {l1, l2}. Note the naming: the dictionary keys follow sklearn's step__parameter syntax (lr__C, lr__penalty), so the sampled values can be passed straight to the pipeline's set_params, while the labels ('C', 'penalty') are the names under which hyperopt records the values. Hyperopt provides a number of other distributions as well, among them hp.uniform, hp.normal, hp.lognormal, hp.quniform, and hp.randint.
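For illustration, here is a sketch of a few of those other distributions; the parameter names below are hypothetical, chosen only to show typical uses:

from hyperopt import hp
import numpy as np

# a hypothetical search space illustrating other distributions
example_space = {
    # uniform float on [0, 1]
    'subsample': hp.uniform('subsample', 0.0, 1.0),
    # quantized uniform: 10, 20, ..., 100
    'n_estimators': hp.quniform('n_estimators', 10, 100, 10),
    # random integer from 0 (inclusive) to 5 (exclusive)
    'max_depth_idx': hp.randint('max_depth_idx', 5),
    # log-normally distributed positive float
    'learning_rate': hp.lognormal('learning_rate', np.log(0.01), 1.0),
}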





Now define the objective function that hyperopt will minimize. It accepts a set of hyperparameters, applies them to the pipeline, and evaluates the model with cross-validation. Since fmin minimizes the objective, we return the ROC AUC with a minus sign:





def objective(params, pipeline, X_train, y_train):
    """
    Cross-validation objective function for hyperopt

    :params: hyperparameters to evaluate
    :pipeline: the model pipeline
    :X_train: training features
    :y_train: training labels
    :return: dict with the loss and metadata, stored in Trials()
    """

    # set the hyperparameters on the pipeline
    pipeline.set_params(**params)

    # stratified 4-fold cross-validation
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

    # ROC AUC on each fold
    score = cross_val_score(estimator=pipeline, X=X_train, y=y_train,
                            scoring='roc_auc', cv=skf, n_jobs=-1)

    # fmin minimizes the loss, so negate the mean score;
    # the whole returned dict is recorded in Trials()
    return {'loss': -score.mean(), 'params': params, 'status': STATUS_OK}



The results of every evaluation are accumulated in a Trials object: each call of the objective function appends its returned dictionary there, so the whole search history can be analyzed afterwards. The search itself is launched with fmin, which returns the best hyperparameters found. tpe.suggest selects the Tree of Parzen Estimators algorithm; to use Random Search instead, pass tpe.rand.suggest.





# run the hyperopt search
trials = Trials()
best = fmin(
    # the objective to minimize, with the pipeline and data fixed
    fn=partial(objective, pipeline=model, X_train=X, y_train=y),
    # the search space defined above
    space=search_space,
    # the optimization algorithm
    algo=tpe.suggest,
    # maximum number of evaluations
    # (the more, the better the result, but the longer it runs)
    max_evals=40,
    # object that stores the search history
    trials=trials,
    # random state (newer hyperopt versions expect
    # rstate=np.random.default_rng(1) instead)
    rstate=np.random.RandomState(1),
    # progress bar
    show_progressbar=True
)
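One caveat: for hp.choice parameters, the dictionary returned by fmin contains the index of the chosen option rather than the option itself. hyperopt's space_eval helper maps the result back to actual parameter values:

from hyperopt import space_eval

# convert choice indices in `best` back into real parameter values
best_params = space_eval(search_space, best)
print(best_params)

Because the search-space keys follow sklearn's step__parameter naming, this dictionary can later be passed straight to the pipeline's set_params.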



The Trials object now holds one dictionary per evaluation. To analyze the search, convert them into a pandas DataFrame and plot the results:





def df_results(hp_results):
    """
    Convert hyperopt Trials results into a pandas DataFrame

    :hp_results: list of result dicts from a hyperopt Trials object
    :return: pandas DataFrame
    """

    results = pd.DataFrame([{**x, **x['params']} for x in hp_results])
    results.drop(labels=['status', 'params'], axis=1, inplace=True)
    # the loss is the negated ROC AUC, so recover the metric
    results['roc_auc'] = -results['loss']
    results.sort_values(by=['roc_auc'], ascending=False, inplace=True)
    return results

results = df_results(trials.results)
sns.set_context("talk")
plt.figure(figsize=(8, 8))
ax = sns.scatterplot(x='lr__C', y='roc_auc', hue='lr__penalty',
                     data=results)
ax.set_xscale('log')
ax.set_xlim(1e-4, 2e2)
ax.grid()



As the scatter plot shows, Hyperopt does not sample the space uniformly: most of the trials cluster around the values of C and penalty that give the highest ROC AUC, which is exactly the behavior we expect from Bayesian optimization.
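As a final step (not part of the original listing), the pipeline can be refit on the full training data with the best hyperparameters, assuming best_params was obtained via space_eval as shown above:

# refit the whole pipeline with the best hyperparameters found
model.set_params(**best_params)
model.fit(X, y)

# the fitted pipeline can now be used for predictions, e.g.
# proba = model.predict_proba(X_new)[:, 1]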





Thus, hyperopt is a powerful model tuning tool that is easy and convenient to use. Additional materials, including an extended example that tunes several different models, can be found in the accompanying repository.







