Source code for gossipcat.lab.GridSearch

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
author:     Ewen Wang
email:      wolfgangwong2012@gmail.com
license:    Apache License 2.0
"""
import pandas as pd 
import xgboost as xgb
import warnings 
warnings.filterwarnings('ignore')


class GridSearch(object):
    """Perform a grid search for XGBoost hyper-parameter tuning, focusing on
    `max_depth`, `subsample`, and `colsample_bytree`.

    Two modes are supported, selected by `if_visualize`:

    * search mode (default): a DMatrix is built from `df` and `search()`
      writes one CSV row per parameter combination to `log_path`;
    * visualize mode: an existing log at `log_path` is loaded into
      `self.data` so it can be queried and plotted.

    `get_last`, `get_best`, `get_top` and `visualize` read `self.data`, which
    is only populated by `get_log()`.
    """

    def __init__(self, df=None, target=None, features=None, regression=False,
                 if_visualize=False, log_path='grid_search.log'):
        """
        Args:
            df (pandas.DataFrame): A training set.
            target (str): The target for supervised machine learning.
            features (list): The feature list for the model.
            regression (bool): Whether the machine learning task is regression.
            if_visualize (bool): Whether the task is to visualize, default False.
            log_path (str): The logging file.
        """
        super().__init__()
        self.df = df
        self.target = target
        self.features = features
        self.regression = regression
        self.if_visualize = if_visualize
        self.log_path = log_path

        # Settings forwarded to `xgb.cv` as keyword arguments.
        self.generalParams = {
            'nfold': 5,
            'learning_rate': 0.1,
            'n_rounds': 3000,
            'early_stopping_rounds': 100,
            'maximize': True,
            'verbose': 1,
            'seed': 123,
            'stratified': True,
        }

        # Booster parameters; `max_depth`, `subsample` and `colsample_bytree`
        # are overwritten for every grid point by `search()`.
        self.treeParams = {
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'eval_metric': 'aucpr',
            'eta': self.generalParams['learning_rate'],
            'gamma': 0,
            'min_child_weight': 0.01,
            'max_depth': 3,
            'subsample': 0.75,
            'colsample_bytree': 0.75,
            'colsample_bylevel': 0.7,
            'colsample_bynode': 1,
            'lambda': 5,
            'alpha': 0.2,
        }

        # Sort direction used when ranking results. The default metric
        # (aucpr) is maximised, so the best rows come first when sorting in
        # descending order. It must be set for both task types: the result
        # helpers read it unconditionally.
        self.ascending = False

        if self.regression:
            self.treeParams['objective'] = 'reg:squarederror'
            self.treeParams['eval_metric'] = 'rmse'
            self.generalParams['maximize'] = False
            self.generalParams['stratified'] = False
            # rmse is minimised, so the best rows are the smallest ones.
            self.ascending = True

        if self.if_visualize:
            self.get_log()
        else:
            self.dtrain = xgb.DMatrix(data=self.df[self.features],
                                      label=self.df[self.target],
                                      silent=False,
                                      nthread=-1)

    def search(self, range_max_depth=range(1, 10, 1),
               range_subsample=range(50, 91, 5),
               range_colsample_bytree=range(50, 91, 5)):
        """To search on the hyper-parameter space.

        Runs `xgb.cv` for every combination of the three ranges and appends
        the last row of each cross-validation result to the log file.
        `subsample` and `colsample_bytree` are given in percent and divided
        by 100 before being passed to XGBoost.

        Args:
            range_max_depth (list): The search space of `max_depth`, default range(1, 10, 1).
            range_subsample (list): The search space of `subsample`, default range(50, 91, 5).
            range_colsample_bytree (list): The search space of `colsample_bytree`, default range(50, 91, 5).
        """
        self.range_max_depth = range_max_depth
        self.range_subsample = range_subsample
        self.range_colsample_bytree = range_colsample_bytree

        metric = self.treeParams['eval_metric']

        # Opening with 'w' truncates any previous log before the header row.
        with open(self.log_path, 'w') as f:
            f.write('max_depth,subsample,colsample_bytree,best_round,'
                    'train_{0}_mean,train_{0}_std,test_{0}_mean,test_{0}_std\n'
                    .format(metric))

        for d in self.range_max_depth:
            for s in self.range_subsample:
                for c in self.range_colsample_bytree:
                    self.treeParams['max_depth'] = d
                    self.treeParams['subsample'] = s / 100
                    self.treeParams['colsample_bytree'] = c / 100

                    cvr = xgb.cv(params=self.treeParams,
                                 dtrain=self.dtrain,
                                 num_boost_round=self.generalParams['n_rounds'],
                                 nfold=self.generalParams['nfold'],
                                 stratified=self.generalParams['stratified'],
                                 metrics=metric,
                                 maximize=self.generalParams['maximize'],
                                 early_stopping_rounds=self.generalParams['early_stopping_rounds'],
                                 verbose_eval=self.generalParams['verbose'],
                                 seed=self.generalParams['seed'])

                    # Pull plain scalars out of the final row instead of
                    # formatting one-row Series objects with %f.
                    last = cvr.iloc[-1]

                    # The log is re-opened in append mode for each grid point
                    # so every finished combination is on disk immediately.
                    with open(self.log_path, 'a') as f:
                        f.write('%d,%f,%f,%d,%f,%f,%f,%f\n' % (
                            self.treeParams['max_depth'],
                            self.treeParams['subsample'],
                            self.treeParams['colsample_bytree'],
                            cvr.index[-1],
                            last['train-{}-mean'.format(metric)],
                            last['train-{}-std'.format(metric)],
                            last['test-{}-mean'.format(metric)],
                            last['test-{}-std'.format(metric)]))

        print('done.')
        return None

    def _score_column(self):
        """Return the log column holding the mean cross-validation test score."""
        return 'test_{}_mean'.format(self.treeParams['eval_metric'])

    def _ranked(self):
        """Return the loaded log sorted so that the best rows come first."""
        return self.data.sort_values(by=self._score_column(),
                                     ascending=self.ascending)

    def get_log(self):
        """Load the log file into `self.data` and print the best row."""
        self.data = pd.read_csv(self.log_path)
        # Print the returned row as well; `get_best` only prints a header.
        print(self.get_best())
        return None

    def get_last(self):
        """Return the last row of the loaded log as a one-row DataFrame."""
        print('the lastest results:')
        return self.data.iloc[-1:]

    def get_best(self):
        """Return the best row of the loaded log as a one-row DataFrame."""
        print('the best results:')
        return self._ranked().head(1)

    def get_top(self, top):
        """Return the `top` best rows of the loaded log.

        Args:
            top (int): The number of rows to return.
        """
        print('the top %d results:' % top)
        return self._ranked().head(top)

    def visualize(self, max_depth=1, top=1):
        """To visualize the grid search results in 3D format.

        The x-axis: `subsample`, the y-axis: `colsample_bytree`, and the
        z-axis: the mean of cross-validation test score.

        Args:
            max_depth (int): The `max_depth` for the 3D visualization.
            top (int): The top results to print out.

        Return:
            The top results of grid search.
        """
        # Importing Axes3D registers the '3d' projection on older matplotlib.
        from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
        import matplotlib.pyplot as plt
        from matplotlib import cm

        df = self.data[self.data.max_depth == max_depth]
        x = 'subsample'
        y = 'colsample_bytree'
        z = self._score_column()

        fig = plt.figure(figsize=(8, 8))
        # Create the axes through the figure: an Axes3D constructed directly
        # is not attached to the figure on newer matplotlib releases.
        ax = fig.add_subplot(111, projection='3d')
        surf = ax.plot_trisurf(df[x], df[y], df[z], cmap=cm.coolwarm,
                               linewidth=0, antialiased=False)

        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.set_zlabel(z)
        fig.colorbar(surf, shrink=.5, aspect=5)
        ax.set_title('Grid Search Visualization (max_depth: %d)' % (max_depth))
        plt.show()

        return df.sort_values(by=z, ascending=self.ascending).head(top)
# Script entry point.
# NOTE(review): no `main` is defined in the visible part of this file, so
# running the module directly would presumably raise NameError -- confirm.
if __name__ == '__main__':
    main()