# Source code for gossipcat.dev.CAT

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
author:     Ewen Wang
email:      wolfgangwong2012@gmail.com
license:    Apache License 2.0
"""
import warnings
warnings.filterwarnings('ignore')
import random
random.seed(0)

import time
import json

import pandas as pd 
import matplotlib.pyplot as plt

import catboost as cb

class CAT(object):
    """Quickly develop a CatBoost model with best-practice parameters."""

    def __init__(self, df, indcol, target, features, features_cat,
                 regression=False, predicting=False, multi=0, balanced=0,
                 gpu=0, seed=0):
        """
        Args:
            df (pandas.DataFrame): A DataFrame for modeling. Its categorical
                feature columns are overwritten in place with missing values
                replaced by the string 'NaN'.
            indcol (str): The indicator column name for the dataset.
            target (str): The target column name. Ignored (stored as None)
                when `predicting` is True.
            features (list): The feature list.
            features_cat (list): Categorical feature list.
            regression (bool): Whether a regression task, default False.
                Selects the 'RMSE' loss instead of 'Logloss'.
            predicting (bool): Whether a predicting task, default False.
                When True only `self.dtest` is built, otherwise `self.dtrain`.
            multi (bool): Whether a multi-category task, default False.
            balanced (bool): Whether the sample is balanced for binary
                classification task, default False.
            gpu (bool): Whether to use GPU, default False.
            seed (int): The seed for randomness, passed to cross-validation.

        NOTE(review): `multi`, `balanced` and `gpu` are stored on the
        instance but are not consumed by any code in this class.
        """
        super(CAT, self).__init__()
        self.df = df
        self.indcol = indcol
        self.features = features
        self.features_cat = features_cat
        self.regression = regression
        self.predicting = predicting

        # Categorical features are filled with a string placeholder before
        # being handed to the CatBoost pool.
        self.df[self.features_cat] = self.df[self.features_cat].fillna('NaN')

        if self.predicting:
            self.target = None
            self.dtest = cb.Pool(data=self.df[self.features],
                                 cat_features=self.features_cat)
        else:
            self.target = target
            self.dtrain = cb.Pool(data=self.df[self.features],
                                  label=self.df[self.target],
                                  cat_features=self.features_cat)

        self.multi = multi
        self.balanced = balanced
        self.gpu = gpu
        self.seed = seed

        self.params = {}
        if self.regression:
            self.params['loss_function'] = 'RMSE'
        else:
            self.params['loss_function'] = 'Logloss'

        # Cross-validation history; stays empty until `algorithm` runs.
        self.cvr = pd.DataFrame()
        self.prediction = pd.DataFrame()
        self.prediction[self.indcol] = self.df[self.indcol]

    @staticmethod
    def _best_round(cvr, col_loss):
        """Return ``(n_rounds, loss)`` for the best row of a CV history.

        Args:
            cvr (pandas.DataFrame): Cross-validation history holding an
                'iterations' column and the `col_loss` column.
            col_loss (str): Name of the loss column to minimise.

        Returns:
            tuple: The number of boosting rounds as a plain `int` and the
            corresponding loss as a plain `float`.
        """
        ordered = cvr.sort_values([col_loss, 'iterations'])
        # 'iterations' is a zero-based index, so the number of trees that
        # reaches the best score is index + 1. The cast to a builtin int keeps
        # `self.params` JSON-serialisable (numpy.int64 is not).
        n_rounds = int(ordered['iterations'].values[0]) + 1
        loss = float(ordered[col_loss].values[0])
        return n_rounds, loss

    def algorithm(self, learning_rate=0.01, iterations=100,
                  early_stopping_rounds=20, nfold=10, verbose=100,
                  plot=False):
        """Perform cross-validation on the training set.

        The history is stored in `self.cvr`, the selected number of rounds in
        `self.n_rounds`, and `self.params['iterations']` is updated to it.

        Args:
            learning_rate (float): Boosting learning rate.
            iterations (int): Maximum number of boosting iterations.
            early_stopping_rounds (int): Stored in the training parameters as
                'early_stopping_rounds'.
            nfold (int): Number of folds in CV.
            verbose (bool or int): Stored in the training parameters as
                'verbose' to control progress output.
            plot (bool): Whether plot the output, default False.
        """
        self.params['learning_rate'] = learning_rate
        self.params['iterations'] = iterations
        self.params['early_stopping_rounds'] = early_stopping_rounds
        self.params['verbose'] = verbose

        message = 'cross validation started and will stop if performace did not improve in {} rounds.'.format(early_stopping_rounds)
        print(message)
        self.cvr = cb.cv(dtrain=self.dtrain,
                         params=self.params,
                         nfold=nfold,
                         seed=self.seed,
                         plot=plot)

        col_loss = 'test-{}-mean'.format(self.params['loss_function'])
        self.n_rounds, loss = self._best_round(self.cvr, col_loss)
        self.params['iterations'] = self.n_rounds

        message = 'cross validation done with number of rounds: {}.'.format(self.n_rounds)
        print(message)
        message = 'test {}: {:.3f}'.format(self.params['loss_function'], loss)
        print(message)
        return None

    def load_model(self, path_model='model_cb.json', format='json'):
        """Load a pretrained model into `self.bst`.

        Args:
            path_model (str): Path of the model.
            format (str): Model format, default json.
        """
        if self.regression:
            self.bst = cb.CatBoostRegressor()
        else:
            self.bst = cb.CatBoostClassifier()
        self.bst = self.bst.load_model(fname=path_model, format=format)

        message = 'model loaded from path: %s' % path_model
        print(message)
        return None

    def save_model(self, path_model='model_cb.json', format='json'):
        """Save the trained model held in `self.bst`.

        Args:
            path_model (str): Path of the model. Nothing is saved when None.
            format (str): Model format, default json.
        """
        if path_model is not None:
            self.bst.save_model(fname=path_model, format=format)
        return None

    def train(self, path_model='model_cb.json'):
        """Train a model with the best iteration rounds obtained from `algorithm`.

        When `algorithm` has not been run yet it is called first with its
        default arguments. The fitted model is saved and its predictions on
        the training pool are written to `self.prediction` ('pred', plus
        'prob' for classification).

        Args:
            path_model (str): Path to save the model.
        """
        if getattr(self, 'n_rounds', None) is None:
            message = 'no hpyter parameters assigned and default assigned.'
            print(message)
            self.algorithm()
        else:
            message = 'number of training rounds: %d.' % self.n_rounds
            print(message)

        print(json.dumps(self.params, indent=4))

        if self.regression:
            self.bst = cb.CatBoostRegressor(**self.params)
        else:
            self.bst = cb.CatBoostClassifier(**self.params)
        self.bst.fit(self.dtrain)

        self.save_model(path_model=path_model)

        self.prediction['pred'] = self.bst.predict(self.dtrain)
        if not self.regression:
            # Column 1 of predict_proba is kept as the probability score.
            self.prediction['prob'] = self.bst.predict_proba(self.dtrain)[:, 1]
        message = 'prediction done.'
        print(message)
        return None

    def predict(self, path_model='model_cb.json', path_result='prediction.csv',
                model_format='json'):
        """Predict with model loaded from the path and save it as a CSV file.

        Args:
            path_model (str): Path of the model.
            path_result (str): Path of the prediction. Nothing is written
                when None.
            model_format (str): Model format, default json.
        """
        self.load_model(path_model=path_model, format=model_format)

        self.prediction['pred'] = self.bst.predict(self.dtest)
        if not self.regression:
            # Column 1 of predict_proba is kept as the probability score.
            self.prediction['prob'] = self.bst.predict_proba(self.dtest)[:, 1]
        message = 'prediction done.'
        print(message)

        if path_result is not None:
            self.prediction.to_csv(path_result, index=False)
            message = 'results saved in path: %s' % path_result
            print(message)
        return None

    def learning_curve(self, figsize=(10, 5)):
        """Draw a learning curve of the cross-validation.

        Args:
            figsize (tuple): Figure size of the chart.

        Returns:
            A message string when no cross-validation history exists,
            otherwise None.
        """
        if len(self.cvr) == 0:
            return 'no models trained, no learning curves.'

        plt.figure(figsize=figsize)
        # Columns are picked by position: index 1 is plotted as the test
        # curve and index 3 as the train curve.
        plt.plot(self.cvr[self.cvr.columns[1]], label='test')
        plt.plot(self.cvr[self.cvr.columns[3]], label='train')
        plt.title('learning curve')
        plt.xlabel('number of rounds')
        plt.ylabel(self.params['loss_function'])
        plt.legend(loc='upper right', title='dataset')
        plt.grid()
        plt.show()
        return None

    def report(self):
        """Report for the binary classification task.

        Plots the distribution of `self.prediction['prob']` and builds a
        `Visual` report from it and the target column; the report's `df_cap`
        is kept in `self.df_cap`.

        Returns:
            An error message string when `Visual` cannot be imported,
            otherwise None.
        """
        try:
            from gossipcat.lab.Report import Visual
        except Exception:
            print('[WARNING] Package GossipCat not installed.')
            # Fall back to a standalone `Report` module.
            try:
                from Report import Visual
            except Exception:
                return '[ERROR] Package Report not installed.'

        test_target = self.df[self.target]
        prob = self.prediction['prob']

        plt.figure(figsize=(6, 5.5))
        prob.hist(bins=100)
        plt.title('distribution of predictions')

        vis = Visual(test_target=test_target, test_predprob=prob)
        vis.combo()
        self.df_cap = vis.df_cap
        return None
# Script entry point.
# NOTE(review): `main` is not defined anywhere in the visible source — confirm
# it exists, otherwise running this module as a script raises NameError.
if __name__ == '__main__': main()