Import Libraries

!pip install optuna
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold


from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score


from sklearn import preprocessing
from sklearn import model_selection
import sklearn.datasets


import xgboost as xgb
from xgboost import XGBClassifier

import optuna

import matplotlib.pyplot as plt

from fastbook import *
from fastai.tabular.all import *
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG
import random as rd



pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

Download Data

# Install the Kaggle API credentials (kaggle.json kept on Google Drive) with
# the restrictive permissions the Kaggle CLI requires.
!mkdir -p ~/.kaggle
!cp /content/gdrive/MyDrive/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Destination directory on Google Drive for the competition data.
path = Path('/content/gdrive/MyDrive/Kaggle/' + 'data/homesite-quote')
path.mkdir(parents=True, exist_ok=True)
path
Path('/content/gdrive/MyDrive/Kaggle/data/homesite-quote')
# Download the competition archives and unzip them (-n: skip files already extracted).
!kaggle competitions download -c homesite-quote-conversion -p /content/gdrive/MyDrive/Kaggle/data/homesite-quote
! unzip -q -n '{path}/train.csv.zip' -d '{path}'
! unzip -q -n '{path}/test.csv.zip' -d '{path}'
! unzip -q -n '{path}/sample_submission.csv.zip' -d '{path}'
# low_memory=False reads each file in one pass so dtypes are inferred consistently.
df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)

EDA with Fastai

dep_var='QuoteConversion_Flag'

As 'QuoteNumber' is unique, set it as index

# 'QuoteNumber' uniquely identifies each row, so it serves as the index.
df_train = df.set_index('QuoteNumber')
df_test = test_df.set_index('QuoteNumber')

Use Fastai function to add relevant datetime fields

# Parse the quote date, then let fastai's add_datepart expand it into several
# engineered date columns (year, month, day-of-week, elapsed, ...) and drop
# the original datetime column.
df_train['Original_Quote_Date'] = pd.to_datetime(df_train['Original_Quote_Date'])
df_test['Original_Quote_Date'] = pd.to_datetime(df_test['Original_Quote_Date'])
df_train = add_datepart(df_train, 'Original_Quote_Date')
df_test = add_datepart(df_test, 'Original_Quote_Date')

Drop the 2 fields below because they have constant values

# These two columns hold a single constant value, so they carry no signal.
df_train.drop(columns=['PropertyField6','GeographicField10A'],axis=1,inplace=True)
df_test.drop(columns=['PropertyField6','GeographicField10A'],axis=1,inplace=True)

Use Fastai function to identify continuous and categorical variables

# fastai heuristic split of the columns into continuous and categorical sets,
# excluding the dependent variable.
cont_names, cat_names = cont_cat_split(df_train,dep_var=dep_var)
len(cont_names), len(cat_names)
(155, 152)

'procs' will take care of categorifying categorical variables, filling in missing values, and normalising the data

# Preprocessing pipeline: Categorify encodes categorical columns, FillMissing
# imputes missing values, Normalize standardizes the continuous columns.
procs = [Categorify, FillMissing, Normalize]
# 80/20 train/validation split, stratified on the target to keep class balance.
splits = TrainTestSplitter(test_size=0.2, stratify=df_train[dep_var])(df_train)

Create a TabularPandas dataset

# Wrap the training frame in a TabularPandas object so procs are fitted on the
# training split and the target is treated as a category.
to = TabularPandas(df=df_train, procs=procs, cat_names=cat_names, 
                   cont_names=cont_names, y_names=dep_var,splits=splits,
                  y_block=CategoryBlock())
# NOTE(review): layers/embed_ps/ps look like tabular_learner arguments rather
# than dataloaders ones -- confirm they are actually honored here.
dls = to.dataloaders(bs=4096, val_bs=512, layers=[10000,500], embed_ps=0.02, ps=[0.001, 0.01])

XGBoost with Optuna

Use Optuna to select the best hyperparameters for the XGBoost model. Code is referenced from https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna

# Pull the fully-preprocessed feature matrices and flattened target arrays
# out of the fastai TabularPandas splits for use with XGBoost.
X_train_fa, y_train_fa = to.train.xs, to.train.ys.values.ravel()
X_valid_fa, y_valid_fa = to.valid.xs, to.valid.ys.values.ravel()

Define the parameter search space to test

def objective(trial):
    """Optuna objective: train an XGBoost classifier with sampled
    hyperparameters and return the validation ROC AUC (maximized).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameter values.

    Returns
    -------
    float
        ROC AUC score on the fastai validation split.
    """
    # Preprocessed train/valid splits from the TabularPandas object.
    X_train_fa, y_train_fa = to.train.xs, to.train.ys.values.ravel()
    X_valid_fa, y_valid_fa = to.valid.xs, to.valid.ys.values.ravel()
    param = {
        'tree_method': 'gpu_hist',  # train on the GPU to speed up the search
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the sampled distribution is identical.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': 1000,  # upper bound; early stopping picks the effective count
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)

    # Stop adding trees once the validation metric has not improved for 100 rounds.
    model.fit(X_train_fa, y_train_fa, eval_set=[(X_valid_fa, y_valid_fa)],
              early_stopping_rounds=100, verbose=False)

    # Probability of the positive class is what ROC AUC needs.
    preds = model.predict_proba(X_valid_fa)[:, 1]

    return roc_auc_score(y_valid_fa, preds)

Fit model using Optuna

# Maximize validation AUC over 50 Optuna trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
[I 2021-07-03 13:18:30,783] A new study created in memory with name: no-name-fe83554f-1bcf-4f1e-976a-1d2d12ba22da
[I 2021-07-03 13:18:54,840] Trial 0 finished with value: 0.9614490168531203 and parameters: {'lambda': 0.6618680618471974, 'alpha': 0.04239837985417904, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 7, 'random_state': 48, 'min_child_weight': 253}. Best is trial 0 with value: 0.9614490168531203.
[I 2021-07-03 13:19:22,792] Trial 1 finished with value: 0.96370909267955 and parameters: {'lambda': 8.086230640660201, 'alpha': 0.004463392998945948, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 24, 'min_child_weight': 81}. Best is trial 1 with value: 0.96370909267955.
[I 2021-07-03 13:20:05,297] Trial 2 finished with value: 0.9663443798690221 and parameters: {'lambda': 0.06952864689008562, 'alpha': 0.03710247783593982, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 44}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:20:33,443] Trial 3 finished with value: 0.9659511091468089 and parameters: {'lambda': 0.1750300039236326, 'alpha': 0.16170199753683476, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 13, 'random_state': 48, 'min_child_weight': 66}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:21:05,326] Trial 4 finished with value: 0.9659325248764232 and parameters: {'lambda': 0.01680518019972616, 'alpha': 0.00967393286707375, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.018, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 30}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:21:29,690] Trial 5 finished with value: 0.9655972055590062 and parameters: {'lambda': 0.020530533816826284, 'alpha': 1.1760096602282823, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 44}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:21:35,195] Trial 6 finished with value: 0.9349120024842033 and parameters: {'lambda': 0.42768461846209, 'alpha': 1.353454042817887, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.009, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 78}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:21:40,078] Trial 7 finished with value: 0.9430147359254457 and parameters: {'lambda': 2.5542913283137563, 'alpha': 1.013512477448444, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.016, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 208}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:22:17,721] Trial 8 finished with value: 0.9633637626415875 and parameters: {'lambda': 0.018688998050368725, 'alpha': 0.17392736957556704, 'colsample_bytree': 0.6, 'subsample': 0.5, 'learning_rate': 0.014, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 220}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:23:01,548] Trial 9 finished with value: 0.9652223224163096 and parameters: {'lambda': 0.02137093874894116, 'alpha': 0.15451044816007484, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.016, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 117}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:24:15,020] Trial 10 finished with value: 0.9661774617228225 and parameters: {'lambda': 0.001620082617449181, 'alpha': 8.552250413676935, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 7}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:25:49,103] Trial 11 finished with value: 0.9660047912737255 and parameters: {'lambda': 0.0014240237450165209, 'alpha': 8.559648869065919, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 4}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:26:50,907] Trial 12 finished with value: 0.9654791512167799 and parameters: {'lambda': 0.002057460573404571, 'alpha': 0.0010463310681730537, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.008, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 6}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:27:19,201] Trial 13 finished with value: 0.9646930333214024 and parameters: {'lambda': 0.0038974148954420747, 'alpha': 0.02630961934977218, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 9, 'random_state': 24, 'min_child_weight': 144}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:28:28,550] Trial 14 finished with value: 0.9657240892698717 and parameters: {'lambda': 0.07619038564231667, 'alpha': 8.897569356075648, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 4}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:29:00,356] Trial 15 finished with value: 0.9641639083366684 and parameters: {'lambda': 0.005467248640780835, 'alpha': 0.002206643068223733, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 116}. Best is trial 2 with value: 0.9663443798690221.
[I 2021-07-03 13:29:49,551] Trial 16 finished with value: 0.9664961781588531 and parameters: {'lambda': 0.08287684030183871, 'alpha': 0.021800136799959794, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 44}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:30:19,110] Trial 17 finished with value: 0.9656477756261004 and parameters: {'lambda': 0.0736986353742997, 'alpha': 0.017914630395936607, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.018, 'max_depth': 11, 'random_state': 48, 'min_child_weight': 166}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:30:58,831] Trial 18 finished with value: 0.9661479134448582 and parameters: {'lambda': 0.20528790013698178, 'alpha': 0.05696439914463783, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 17, 'random_state': 48, 'min_child_weight': 49}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:31:35,970] Trial 19 finished with value: 0.9654636560790603 and parameters: {'lambda': 1.8293301690601609, 'alpha': 0.0079463512796583, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.009, 'max_depth': 13, 'random_state': 48, 'min_child_weight': 109}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:32:03,872] Trial 20 finished with value: 0.9634307014917336 and parameters: {'lambda': 0.05935031259030531, 'alpha': 0.37572455565093704, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.008, 'max_depth': 9, 'random_state': 48, 'min_child_weight': 180}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:32:58,662] Trial 21 finished with value: 0.966360010156296 and parameters: {'lambda': 0.007760285328621599, 'alpha': 0.053988600576792355, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 24}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:33:43,499] Trial 22 finished with value: 0.9661228311356425 and parameters: {'lambda': 0.006653218220222155, 'alpha': 0.06371101614572536, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 28}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:34:23,905] Trial 23 finished with value: 0.9658870820455391 and parameters: {'lambda': 0.03774935662196809, 'alpha': 0.019484399312365754, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 92}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:34:49,917] Trial 24 finished with value: 0.960692411034934 and parameters: {'lambda': 0.22050638749514634, 'alpha': 0.3497176109339869, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 298}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:35:33,230] Trial 25 finished with value: 0.9661802371154521 and parameters: {'lambda': 0.008553754604813191, 'alpha': 0.010383443637253126, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 56}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:36:21,724] Trial 26 finished with value: 0.9663990659640549 and parameters: {'lambda': 0.03785620605363892, 'alpha': 0.00366817865425139, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 28}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:37:09,025] Trial 27 finished with value: 0.9659814067807833 and parameters: {'lambda': 0.011887161277513221, 'alpha': 0.0022334644633789213, 'colsample_bytree': 0.4, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 25}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:37:39,925] Trial 28 finished with value: 0.9657993844652173 and parameters: {'lambda': 0.03269086093271246, 'alpha': 0.002591116596186556, 'colsample_bytree': 0.4, 'subsample': 0.6, 'learning_rate': 0.012, 'max_depth': 9, 'random_state': 24, 'min_child_weight': 23}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:38:06,678] Trial 29 finished with value: 0.9632682107001242 and parameters: {'lambda': 0.0028161256757211007, 'alpha': 0.0010181643728821939, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 7, 'random_state': 24, 'min_child_weight': 99}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:38:29,798] Trial 30 finished with value: 0.9638549962767746 and parameters: {'lambda': 0.5323894003316153, 'alpha': 0.005644746026283066, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 129}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:39:10,746] Trial 31 finished with value: 0.9662165573514324 and parameters: {'lambda': 0.1265430806657861, 'alpha': 0.03607744523557369, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 67}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:39:41,841] Trial 32 finished with value: 0.965805495155772 and parameters: {'lambda': 0.03962702582341804, 'alpha': 0.06657948718332439, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 11, 'random_state': 48, 'min_child_weight': 42}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:40:22,974] Trial 33 finished with value: 0.9658125989542109 and parameters: {'lambda': 0.3263042905584334, 'alpha': 0.014621966829258828, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 76}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:40:59,992] Trial 34 finished with value: 0.9660552117109559 and parameters: {'lambda': 0.12103628446814962, 'alpha': 0.0356361328795397, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 13, 'random_state': 48, 'min_child_weight': 58}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:41:22,886] Trial 35 finished with value: 0.9652533525125997 and parameters: {'lambda': 0.05215091371717967, 'alpha': 0.11395752205090479, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.018, 'max_depth': 17, 'random_state': 24, 'min_child_weight': 37}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:41:46,007] Trial 36 finished with value: 0.9653301078058069 and parameters: {'lambda': 0.8300674259125883, 'alpha': 0.0038249812316278288, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 16}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:41:50,974] Trial 37 finished with value: 0.9411099055288206 and parameters: {'lambda': 0.010576441432516159, 'alpha': 0.2553780859410673, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.009, 'max_depth': 5, 'random_state': 48, 'min_child_weight': 81}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:42:35,777] Trial 38 finished with value: 0.9662987439673464 and parameters: {'lambda': 0.023907742250035224, 'alpha': 0.0964642360899176, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 41}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:43:12,544] Trial 39 finished with value: 0.965898889531139 and parameters: {'lambda': 0.013747145139813862, 'alpha': 0.01254077864827821, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 65}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:43:43,219] Trial 40 finished with value: 0.9651608836703399 and parameters: {'lambda': 0.12057384146600342, 'alpha': 0.006354843087402957, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.012, 'max_depth': 11, 'random_state': 24, 'min_child_weight': 93}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:44:20,814] Trial 41 finished with value: 0.9658953671958843 and parameters: {'lambda': 0.023312326888009435, 'alpha': 0.09590285438044613, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 42}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:44:34,794] Trial 42 finished with value: 0.9581546016680716 and parameters: {'lambda': 0.03005460208149567, 'alpha': 0.025891473088036694, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 20, 'random_state': 2020, 'min_child_weight': 19}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:45:16,910] Trial 43 finished with value: 0.9660153884468001 and parameters: {'lambda': 0.017027501529365368, 'alpha': 0.0926033448201442, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 38}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:45:29,673] Trial 44 finished with value: 0.9582815119261712 and parameters: {'lambda': 0.0498652563995772, 'alpha': 0.7044842190007566, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 14}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:45:51,223] Trial 45 finished with value: 0.9640948903554354 and parameters: {'lambda': 0.09962512320523084, 'alpha': 0.04598224416610351, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.014, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 1}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:46:26,321] Trial 46 finished with value: 0.9657404290923046 and parameters: {'lambda': 0.02585297640370347, 'alpha': 0.16959923307006608, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 51}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:47:01,541] Trial 47 finished with value: 0.9660348729097518 and parameters: {'lambda': 0.004220389968619446, 'alpha': 0.028392015763314685, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.016, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 67}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:47:58,615] Trial 48 finished with value: 0.9660819761494438 and parameters: {'lambda': 0.08026597900346424, 'alpha': 0.11850315623816536, 'colsample_bytree': 0.4, 'subsample': 0.5, 'learning_rate': 0.014, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 32}. Best is trial 16 with value: 0.9664961781588531.
[I 2021-07-03 13:48:31,376] Trial 49 finished with value: 0.9650435738573538 and parameters: {'lambda': 0.007245440807228051, 'alpha': 0.24203698386397063, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.018, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 252}. Best is trial 16 with value: 0.9664961781588531.

Save best trials

# Report how many trials completed and the best hyperparameter set found.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
Number of finished trials: 50
Best trial: {'lambda': 0.08287684030183871, 'alpha': 0.021800136799959794, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 44}
Best_trial_fastai= {'lambda': 0.08287684030183871, 'alpha': 0.021800136799959794, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 44}

Optuna Visualisation

plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.

optuna.visualization.plot_optimization_history(study)

plot_parallel_coordinate: interactively visualizes the hyperparameters and scores

optuna.visualization.plot_parallel_coordinate(study)

plot_slice: shows the evolution of the search. You can see where in the hyperparameter space your search went and which parts of the space were explored more.

optuna.visualization.plot_slice(study)

plot_contour: plots parameter interactions on an interactive chart. You can choose which hyperparameters you would like to explore.

# Contour plot of pairwise parameter interactions. The original call listed
# 'subsample' twice in params; the duplicate is removed here.
optuna.visualization.plot_contour(study, params=['alpha',
                            'lambda',
                            'subsample',
                            'learning_rate'])

Visualize parameter importances.

optuna.visualization.plot_param_importances(study)

Visualize empirical distribution function

optuna.visualization.plot_edf(study)

Model Best Optuna Trials

Best_trial_fastai= {'lambda': 0.08287684030183871, 'alpha': 0.021800136799959794, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 15, 'random_state': 48, 'min_child_weight': 44,'n_estimators': 1000,'tree_method':'gpu_hist'}

Using StratifiedKFold cross validation to test roc_auc_score

# 5-fold stratified cross-validation of the tuned model: each fold keeps the
# target class balance, and the held-out fold doubles as the early-stopping
# evaluation set.
kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)
auc = []  # validation AUC of each fold
for fold, (trn_idx, test_idx) in enumerate(kf.split(X_train_fa, y_train_fa), start=1):
    X_tr, X_val = X_train_fa.iloc[trn_idx], X_train_fa.iloc[test_idx]
    y_tr, y_val = y_train_fa[trn_idx], y_train_fa[test_idx]
    model = xgb.XGBClassifier(**Best_trial_fastai)
    # Early stopping on the held-out fold caps the 1000-tree budget.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    # NOTE(review): `model` from the final fold is reused later for test-set
    # inference -- confirm that is intended rather than a full-data refit.
    print(fold, auc[-1])
1 0.9670692718197905
2 0.9666783942446252
3 0.9624103790810723
4 0.9650869557007921
5 0.9661417297238831

Calculate mean of all folds

np.mean(auc)
0.9654773461140327

Create inference for test set fastai

# Build a fastai tabular learner mainly to reuse its fitted preprocessing:
# test_dl applies the same procs (Categorify/FillMissing/Normalize) to the
# test set, and .xs yields the model-ready feature matrix.
roc_auc_binary = RocAucBinary()
learn = tabular_learner(dls, metrics=roc_auc_binary)
dl_test = learn.dls.test_dl(df_test.iloc[:])
X_test=dl_test.dataset.xs

Save Kaggle Submission File

# NOTE(review): `model` here is the classifier trained on the last CV fold,
# not one refit on the full training set -- confirm this is intended.
preds = model.predict_proba(X_test)[:,1]
sample = pd.read_csv(path/'sample_submission.csv')
# Replace the placeholder predictions and write the Kaggle submission file.
sample.QuoteConversion_Flag = preds
sample.to_csv(path/'xgb_optuna_fastaidata.csv', index=False)

Kaggle Score: 0.96633