# Source code for gossipcat.lab.GridSearch

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
author:     Ewen Wang
email:      wolfgangwong2012@gmail.com
license:    Apache License 2.0
"""
import pandas as pd 
import xgboost as xgb
import warnings 
warnings.filterwarnings('ignore')



# [docs]  (documentation-link marker left over from the HTML source view; not code)
class GridSearch(object):
    """Perform a grid search for XGBoost hyper-parameter tuning, focusing on `max_depth`, `subsample`, and `colsample_bytree`.

    Every parameter combination is cross-validated with ``xgb.cv`` and one CSV
    row per combination is appended to ``log_path``. The logged results can be
    loaded back (``get_log``), ranked (``get_best`` / ``get_top``) and plotted
    (``visualize``).
    """
    def __init__(self, df=None, target=None, features=None, regression=False, if_visualize=False, log_path='grid_search.log'):
        """

        Args:
            df (pandas.DataFrame): A training set. 
            target (str): The target for supervised machine learning.
            features (list): The feature list for the model.
            regression (bool): Whether the machine learning task is regression.
            if_visualize (bool): Whether the task is to visualize, default False.
                When True the existing log at `log_path` is loaded and no
                training matrix is built.
            log_path (str): The logging file. 
        """
        super(GridSearch, self).__init__()

        self.df = df
        self.target = target
        self.features = features
        self.regression = regression
        self.if_visualize = if_visualize
        self.log_path = log_path

        self.generalParams = {
            'nfold': 5,
            'learning_rate': 0.1,
            'n_rounds': 3000,
            'early_stopping_rounds': 100,
            'maximize': True,
            'verbose': 1,
            'seed': 123,
            'stratified': True
        }

        self.treeParams = {
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'eval_metric': 'aucpr',
            'eta': self.generalParams['learning_rate'],
            'gamma': 0,
            'min_child_weight': 0.01,
            'max_depth': 3,
            'subsample': 0.75,
            'colsample_bytree': 0.75,
            'colsample_bylevel': 0.7,
            'colsample_bynode': 1,
            'lambda': 5,
            'alpha': 0.2
        }

        # Sort direction used when ranking logged results. The default metric
        # is maximized ('maximize': True), so the best score sorts first in
        # descending order. This must be set for both task types: the ranking
        # methods below read it unconditionally.
        self.ascending = False

        if self.regression:
            self.treeParams['objective'] = 'reg:squarederror'
            self.treeParams['eval_metric'] = 'rmse'
            self.generalParams['maximize'] = False
            self.generalParams['stratified'] = False
            # rmse is minimized, so the best score sorts first ascending.
            self.ascending = True

        if self.if_visualize:
            self.get_log()
        else:
            self.dtrain = xgb.DMatrix(data=self.df[self.features], label=self.df[self.target], silent=False, nthread=-1)

    def _test_score_column(self):
        """Return the log column holding the mean cross-validation test score."""
        return 'test_{}_mean'.format(self.treeParams['eval_metric'])

    def _log_header(self):
        """Return the CSV header line written at the top of the log file."""
        metric = self.treeParams['eval_metric']
        return 'max_depth,subsample,colsample_bytree,best_round,train_{}_mean,train_{}_std,test_{}_mean,test_{}_std\n'\
            .format(metric, metric, metric, metric)

    def _log_row(self, cvr):
        """Format the last row of a cross-validation history as one CSV log line.

        Args:
            cvr (pandas.DataFrame): The history returned by ``xgb.cv``; it is
                read through its ``train-<metric>-mean/std`` and
                ``test-<metric>-mean/std`` columns.

        Returns:
            str: The current `max_depth`, `subsample` and `colsample_bytree`,
            the index of the last history row (logged as `best_round`) and the
            four scores of that row, comma separated and newline terminated.
        """
        metric = self.treeParams['eval_metric']
        # Take scalars from the last row rather than formatting one-element
        # Series: implicit float(Series) conversion is deprecated in pandas.
        last = cvr.iloc[-1]
        return '%d,%f,%f,%d,%f,%f,%f,%f\n' % (
            self.treeParams['max_depth'],
            self.treeParams['subsample'],
            self.treeParams['colsample_bytree'],
            cvr.index[-1],
            float(last['train-{}-mean'.format(metric)]),
            float(last['train-{}-std'.format(metric)]),
            float(last['test-{}-mean'.format(metric)]),
            float(last['test-{}-std'.format(metric)]))

    def search(self, 
               range_max_depth=range(1, 10, 1),
               range_subsample=range(50, 91, 5),
               range_colsample_bytree=range(50, 91, 5)):
        """To search on the hyper-parameter space.

        The log file at `log_path` is overwritten with a header line and then
        extended with one line per parameter combination.

        Args:
            range_max_depth (list): The search space of `max_depth`, default range(1, 10, 1). 
            range_subsample (list): The search space of `subsample`, in percent (each value is divided by 100), default range(50, 91, 5). 
            range_colsample_bytree (list): The search space of `colsample_bytree`, in percent (each value is divided by 100), default range(50, 91, 5). 
        """
        from itertools import product

        self.range_max_depth = range_max_depth
        self.range_subsample = range_subsample
        self.range_colsample_bytree = range_colsample_bytree

        with open(self.log_path, 'w') as f:
            f.write(self._log_header())

        # product() iterates in the same order as the equivalent nested loops:
        # max_depth outermost, colsample_bytree innermost.
        for d, s, c in product(self.range_max_depth,
                               self.range_subsample,
                               self.range_colsample_bytree):
            self.treeParams['max_depth'] = d
            self.treeParams['subsample'] = s/100
            self.treeParams['colsample_bytree'] = c/100

            cvr = xgb.cv(params=self.treeParams,
                         dtrain=self.dtrain,
                         num_boost_round=self.generalParams['n_rounds'],
                         nfold=self.generalParams['nfold'],
                         stratified=self.generalParams['stratified'],
                         metrics=self.treeParams['eval_metric'],
                         maximize=self.generalParams['maximize'],
                         early_stopping_rounds=self.generalParams['early_stopping_rounds'],
                         verbose_eval=self.generalParams['verbose'],
                         seed=self.generalParams['seed'])

            # Re-open in append mode for every combination so each finished
            # result is on disk even if a later run is interrupted.
            with open(self.log_path, 'a') as f:
                f.write(self._log_row(cvr))
        print('done.')
        return None

    def get_log(self):
        """Load the log file at `log_path` into `self.data`."""
        self.data = pd.read_csv(self.log_path)
        self.get_best()
        return None

    def get_last(self):
        """Return the last logged result as a one-row DataFrame."""
        print('the lastest results:')
        return self.data.iloc[-1:]

    def get_best(self):
        """Return the best logged result (by mean test score) as a one-row DataFrame."""
        print('the best results:')
        return self.data.sort_values(by=self._test_score_column(), ascending=self.ascending).head(1)

    def get_top(self, top):
        """Return the `top` best logged results, ranked by mean test score.

        Args:
            top (int): The number of results to return.
        """
        print('the top %d results:' % top)
        return self.data.sort_values(by=self._test_score_column(), ascending=self.ascending).head(top)

    def visualize(self, max_depth=1, top=1):
        """To visualize the grid search results in 3D format. The x-axis: `subsample`, the y-axis: `colsample_bytree`, and the z-axis: the mean of cross-validation test score.

        Args:
            max_depth (int): The `max_depth` for the 3D visualization.
            top (int): The top results to print out.

        Return:
            The top results of grid search.
        """
        # Imported for its side effect only: older matplotlib versions need it
        # to register the '3d' projection.
        from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
        import matplotlib.pyplot as plt 
        from matplotlib import cm

        df = self.data[self.data.max_depth == max_depth]
        x = 'subsample'
        y = 'colsample_bytree'
        z = self._test_score_column()

        fig = plt.figure(figsize=(8, 8))
        # Create the axes through the figure: a bare Axes3D(fig) is no longer
        # attached to the figure automatically by current matplotlib releases,
        # which would leave the plot empty.
        ax = fig.add_subplot(projection='3d')
        surf = ax.plot_trisurf(df[x], df[y], df[z], 
                               cmap=cm.coolwarm, linewidth=0, antialiased=False)
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.set_zlabel(z)
        fig.colorbar(surf, shrink=.5, aspect=5)
        plt.title('Grid Search Visualization (max_depth: %d)' %(max_depth))
        plt.show()
        return df.sort_values(by=z, ascending=self.ascending).head(top)




# Script entry point: runs only when the module is executed directly.
# NOTE(review): no `main` function is defined in the visible part of this
# file, so this call would raise NameError unless `main` is provided
# elsewhere — confirm, and either define it or drop this guard.
if __name__ == '__main__':
    main()