Finding Optimal XGBoost Parameters for the Homesite Competition Tabular Dataset
Building on the previous notebook, which applied Zach's techniques for permutation importance and ensemble learning to the Homesite competition problem set, we will dive deep into optimizing the XGBoost parameters to see whether an improved model can be generated, and what effect this has on our resulting ensemble model's predictions.
- Introduction
- Setup
- The GridSearchCV functions
- Exploring single parameter tuning performance
- Hypothesis: Multi-parameter vs Single parameter tuning
- Head-to-Head Comparisons: Default vs Recommended vs Max parameter values
Introduction
Using the code here, we will look at the optimal number of estimators, maximum depth, and learning rate individually, and then at combinations of these, when using XGBoost to generate a model.
Notes:
- A possible improvement if coding this manually would be to add `subsample` as a variable; however, given both the increase in permutations and the long GridSearch running times, and with a plan to explore options like Optuna and Bayesian search to automate this (see the sketch after these notes), I limited the additional variables to just `max_depth` and `learning_rate` for now
- Because of timeouts for long ranges, I broke the ranges for the various parameters into groupings, and will only explore a subsequent grouping if the optimal value in the previous grouping sits at its maximum. This also keeps the permutations at a manageable size when searching for the best combination of all parameters
- Added tuning for `learning_rate` based on this code
- Removed the `triage` variable; `FillMissing` will handle NA values in categorical fields appropriately, but it just needs to be filtered out of the category-modifying function
- Changed the categorize functions from the last notebook to exclude any columns in `y_names` from being evaluated, since these shouldn't be part of the model training as a parameter
- Added code to save all models for reuse in another notebook that submits to Kaggle for test evaluation
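Since the notes above mention Optuna, here is a minimal, hypothetical sketch of what an automated search could look like. It assumes the `to` TabularPandas object and settings defined later in this notebook, mirrors the ranges swept below, and was not actually run here (Optuna is an extra dependency: `pip install optuna`).
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_val_score

def objective(trial):
    # sample from roughly the same ranges the grouped grid searches below cover
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 2000, step=50),
        max_depth=trial.suggest_int('max_depth', 1, 11, step=2),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        tree_method='gpu_hist', gpu_id=0,
    )
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # `to` is the TabularPandas object built later in this notebook
    return cross_val_score(xgb.XGBClassifier(**params), to.xs,
                           to.ys.values.ravel(), scoring='roc_auc', cv=kfold).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_value, study.best_params)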
Adding these based on the tutorial for notebooks
%matplotlib inline
%reload_ext autoreload
%autoreload 2
!pip install -Uqq fastai
!pip install kaggle
from fastai.tabular.all import *
global gdrive #colab only code block
gdrive = Path('/content/gdrive/My Drive')
from google.colab import drive
if not gdrive.exists(): drive.mount(str(gdrive.parent))
!mkdir -p ~/.kaggle
!cp /content/gdrive/MyDrive/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
from kaggle import api
path = Path.cwd()
path.ls()
Only run the next three lines the first time, if working in a local repository. This will prevent large training data files and model files from being checked into GitHub.
!touch .gitignore
!echo "_data" > .gitignore
!mkdir _data
os.chdir('_data')
Path.cwd()
Back to it
os.chdir(path/"gdrive/MyDrive/Kaggle/") # colab only code
Path.cwd()
path = Path.cwd()/"homesite_competition_data"
path.mkdir(exist_ok=True)
Path.BASE_PATH = path
api.competition_download_cli('homesite-quote-conversion', path=path)
file_extract(path/"homesite-quote-conversion.zip")
file_extract(path/"train.csv.zip")
file_extract(path/"test.csv.zip")
path.ls()
Settings
test_size = 0.3
y_block=CategoryBlock()
# n_estimators = [50, 100, 150, 200]
# max_depth = [2, 4, 6, 8]
n_estimators_range = range(50,500,50)
n_estimators_range1 = range(500,1050,50)
n_estimators_range2 = range(1050,1500,50)
n_estimators_range3 = range(1550,2050,50)
max_depth_range = range(1, 12, 2)
max_depth_range1 = range(13,22,2)
max_depth_range2 = range(23,32,2)
learning_rate_range = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
subsample_range = [0.1,0.3,0.5,0.7,1.0]
sampling_methods = ['uniform','gradient_based']
random_seed = 42
n_splits = 10 # 10 folds
scoring = "roc_auc"
category_threshold = 20
from sklearn.metrics import roc_auc_score
# valid_score = roc_auc_score(to_np(targs), to_np(preds[:,1]))
# valid_score
GridSearchCV functions
import xgboost as xgb
import matplotlib
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
This function will tune for `n_estimators` only.
Notes
- Used https://scikit-learn.org/stable/modules/model_evaluation.html#scoring to find the correct `scoring` string
- Reading more on StratifiedKFold to understand whether I used `n_splits` correctly, I chose 10 based on this article
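As a quick sanity check on the stratification, a toy sketch with made-up labels (not competition data) showing that each fold preserves the class balance:
import numpy as np
from sklearn.model_selection import StratifiedKFold

toy_y = np.array([0]*90 + [1]*10)  # imbalanced toy labels, 10% positive
toy_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for _, test_idx in toy_kfold.split(np.zeros(len(toy_y)), toy_y):
    print(toy_y[test_idx].mean())  # each fold keeps ~10% positives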
def xgboost_tune_estimators(X, y, n_estimators_range, n_splits, sampling_method, random_seed, scoring):
    matplotlib.use('Agg')
    # encode string class values as integers
    label_encoded_y = LabelEncoder().fit_transform(y)
    # grid search over n_estimators only
    # (note: XGBoost's keyword is `sampling_method`, singular)
    model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity=2, sampling_method=sampling_method)
    param_grid = dict(n_estimators=n_estimators_range)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=4)
    grid_result = grid_search.fit(X, label_encoded_y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    # plot (labels match the roc_auc scoring used, not log loss)
    pyplot.errorbar(n_estimators_range, means, yerr=stds)
    pyplot.title("XGBoost n_estimators vs ROC AUC")
    pyplot.xlabel('n_estimators')
    pyplot.ylabel('ROC AUC')
    pyplot.savefig('n_estimators.png')
This function will tune for the `max_depth` value only.
def xgboost_tune_max_depth(X, y, max_depth_range, n_splits, sampling_method, random_seed, scoring):
    matplotlib.use('Agg')
    # encode string class values as integers
    label_encoded_y = LabelEncoder().fit_transform(y)
    # grid search over max_depth only
    model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity=2, sampling_method=sampling_method)
    print(max_depth_range)
    param_grid = dict(max_depth=max_depth_range)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=4)
    grid_result = grid_search.fit(X, label_encoded_y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    # plot (labels match the roc_auc scoring used)
    pyplot.errorbar(max_depth_range, means, yerr=stds)
    pyplot.title("XGBoost max_depth vs ROC AUC")
    pyplot.xlabel('max_depth')
    pyplot.ylabel('ROC AUC')
    pyplot.savefig('max_depth.png')
This function will tune for `learning_rate` only.
def xgboost_tune_lr(X, y, lr_range, n_splits, sampling_method, random_seed, scoring):
    matplotlib.use('Agg')
    # encode string class values as integers
    label_encoded_y = LabelEncoder().fit_transform(y)
    # grid search over learning_rate only
    model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity=2, sampling_method=sampling_method)
    param_grid = dict(learning_rate=lr_range)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=4)
    grid_result = grid_search.fit(X, label_encoded_y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    # plot (labels match the roc_auc scoring used)
    pyplot.errorbar(lr_range, means, yerr=stds)
    pyplot.title("XGBoost learning_rate vs ROC AUC")
    pyplot.xlabel('learning_rate')
    pyplot.ylabel('ROC AUC')
    pyplot.savefig('learning_rate.png')
This function will tune for `n_estimators`, `max_depth`, and `learning_rate` in combination (it takes a very long time to run).
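To see why, we can count the model fits GridSearchCV will perform for just the first grouping, using the settings defined above (a quick back-of-the-envelope check):
# fits = |n_estimators values| * |max_depth values| * |learning_rate values| * folds
len(n_estimators_range) * len(max_depth_range) * len(learning_rate_range) * n_splits  # 9 * 6 * 6 * 10 = 3240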
def xgboost_tune_n_estimators_and_max_depth_and_lr(X, y, n_estimators_range, max_depth_range, lr_range, n_splits, sampling_method, random_seed, scoring):
    matplotlib.use('Agg')
    # encode string class values as integers
    label_encoded_y = LabelEncoder().fit_transform(y)
    # grid search over all three parameters in combination
    model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity=2, sampling_method=sampling_method)
    print(n_estimators_range)
    print(max_depth_range)
    print(lr_range)
    param_grid = dict(max_depth=max_depth_range, n_estimators=n_estimators_range, learning_rate=lr_range)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=4)
    grid_result = grid_search.fit(X, label_encoded_y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    # plot results (disabled; one line per max_depth value)
    # scores = np.array(means).reshape(len(max_depth_range), len(n_estimators_range))
    # for i, value in enumerate(max_depth_range):
    #     pyplot.plot(n_estimators_range, scores[i], label='depth: ' + str(value))
    # pyplot.legend()
    # pyplot.xlabel('n_estimators')
    # pyplot.ylabel('ROC AUC')
    # pyplot.savefig('n_estimators_vs_max_depth.png')
def reassign_to_categorical(field, df, y_names, continuous, categorical):
    # only convert fully populated columns that aren't target columns
    if (df[field].isna().sum() == 0) and (field not in y_names):
        field_categories = df[field].unique()
        df[field] = df[field].astype('category')
        # assignment instead of the deprecated inplace=True
        df[field] = df[field].cat.set_categories(field_categories)
        if field in continuous: continuous.remove(field)
        if field not in categorical: categorical.append(field)
    return df, continuous, categorical
def categorize(df, y_names, cont_names, cat_names, category_threshold=50):
    # convert low-cardinality, not-yet-categorical columns to categoricals
    for field in df.columns:
        if (len(df[field].unique()) <= category_threshold) and not isinstance(df[field].dtype, pd.CategoricalDtype):
            reassign_to_categorical(field, df, y_names, cont_names, cat_names)
    return df, cont_names, cat_names
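To make the behavior concrete, a toy example (the frame and threshold are made up for illustration):
toy_df = pd.DataFrame({'a': [1, 2, 1, 2], 'b': [0.1, 0.2, 0.3, 0.4], 'y': [0, 1, 0, 1]})
toy_cont, toy_cat = ['a', 'b'], []
toy_df, toy_cont, toy_cat = categorize(toy_df, ['y'], toy_cont, toy_cat, category_threshold=3)
# 'a' (2 unique values) becomes categorical and moves to toy_cat;
# 'b' (4 unique values) stays continuous; 'y' is skipped as a target column
toy_df.dtypes, toy_cont, toy_cat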
def homesite_prep(df_train, df_test, y_names, category_threshold):
    df_train.QuoteConversion_Flag = df_train.QuoteConversion_Flag.astype(dtype='boolean')
    df_train = df_train.set_index('QuoteNumber')
    df_test = df_test.set_index('QuoteNumber')
    # expand the quote date into engineered date parts
    df_train['Original_Quote_Date'] = pd.to_datetime(df_train['Original_Quote_Date'])
    df_test['Original_Quote_Date'] = pd.to_datetime(df_test['Original_Quote_Date'])
    df_train = add_datepart(df_train, 'Original_Quote_Date')
    df_test = add_datepart(df_test, 'Original_Quote_Date')
    cont_names, cat_names = cont_cat_split(df_train, dep_var=y_names)
    df_train, cont_names, cat_names = categorize(df_train, y_names, cont_names, cat_names, category_threshold)
    return df_train, df_test, cont_names, cat_names
def find_y_columns(df_train, df_test):
    # target columns are those present in train but absent from test
    y_columns = df_train.columns.difference(df_test.columns)
    return y_columns
df_train = pd.read_csv(path/"train.csv", low_memory=False)
df_train.head(2)
df_test = pd.read_csv(path/"test.csv", low_memory=False)
df_test.head(2)
y_names = find_y_columns(df_train, df_test)[0]
df_train, df_test, cont_names, cat_names = homesite_prep(df_train, df_test, y_names, category_threshold)
procs = [Categorify, FillMissing, Normalize]
splits = TrainTestSplitter(test_size=test_size, stratify=df_train[y_names])(df_train)
to = TabularPandas(df=df_train, procs=procs, cat_names=cat_names,
cont_names=cont_names, y_names=y_names,splits=splits,
y_block=y_block)
sampling_methods[0], sampling_methods[1]
%time xgboost_tune_estimators(to.xs, to.ys.values.ravel(), n_estimators_range, n_splits, sampling_methods[1], random_seed, scoring)
%time xgboost_tune_estimators(to.xs, to.ys.values.ravel(), n_estimators_range1, n_splits, sampling_methods[1], random_seed, scoring)
%time xgboost_tune_estimators(to.xs, to.ys.values.ravel(), n_estimators_range2, n_splits, sampling_methods[1], random_seed, scoring)
%time xgboost_tune_estimators(to.xs, to.ys.values.ravel(), n_estimators_range3, n_splits, sampling_methods[1], random_seed, scoring)
From the batches, just tuning `n_estimators`, we went from 0.964941 (0.000612) with {'n_estimators': 450} to 0.966902 (0.000735) with {'n_estimators': 1950}.
%time xgboost_tune_max_depth(to.xs,to.ys.values.ravel(), max_depth_range, n_splits, sampling_methods[1], random_seed, scoring)
%time xgboost_tune_max_depth(to.xs,to.ys.values.ravel(), max_depth_range1, n_splits, sampling_methods[1], random_seed, scoring)
With our two range checks, it seems our best result is 0.965565 (0.000880) with {'max_depth': 11}, and performance gets worse after that: 0.965149 (0.000994) with {'max_depth': 13} in the second range checked is less than the best in the initial range, and scores degrade progressively from there. So we should do our final fine-tuning of parameter permutations with only the first range for `max_depth`, and skip tuning with the final range of `max_depth`.
%time xgboost_tune_lr(to.xs,to.ys.values.ravel(), learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
If tuning the learning rate alone, the best result is 0.963524 using {'learning_rate': 0.3}.
n_estimators_range[0:1], n_estimators_range[1:4], n_estimators_range[4:8]
n_estimators_agg_range = (n_estimators_range[0], n_estimators_range3[-1], 50)
n_estimators_agg_range
Hypothesis: We don't need as large an `n_estimators` value to get at least as good a validation metric as we got with a large `n_estimators` alone. We need to split up the ranges further so the run doesn't time out in Kaggle, and will stop one range after we reach a validation metric equal to the `n_estimators=1950` value.
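As an aside, there is a cheaper alternative to manually scanning `n_estimators` ranges that is worth noting: early stopping lets XGBoost choose the effective tree count itself on the validation split. A sketch only (assuming xgboost >= 1.6; not what was run in this notebook):
model_es = xgb.XGBClassifier(n_estimators=2000, max_depth=5, learning_rate=0.1,
                             tree_method='gpu_hist', gpu_id=0,
                             eval_metric='auc', early_stopping_rounds=50)
model_es.fit(to.train.xs, to.train.ys.values.ravel(),
             eval_set=[(to.valid.xs, to.valid.ys.values.ravel())], verbose=False)
model_es.best_iteration  # effective n_estimators chosen by early stopping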
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.train.xs, to.train.ys.values.ravel(), n_estimators_range[0:1], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.964010 using {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 50}
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: already pretty close
- vs 0.966902 (0.000735) with {'n_estimators': 1950}: still not close enough
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[1:2], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.965491 using {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}
- vs our running best of 0.964010 using {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 50}: a good step up
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: already better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not as good yet
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[2:3], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.965929 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150}
- vs our running best of 0.965491 using {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}: slightly better
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not as good yet
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[3:4], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.966138 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
- vs our running best of 0.965929 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150}: better
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: just slightly worse
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[4:5], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.966389 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 250}
- vs our running best of 0.966138 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}: slightly better
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: much better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not better yet
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[5:6], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.966422 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300}
- vs our running best of 0.966389 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 250}: slightly better
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: much better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not quite better
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[6:7], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.966364 using {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 350}
- vs our running best of 0.966422 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300}: slightly worse
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: much better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not quite better
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[7:8], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.966493 using {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400}
- vs our running best of 0.966422 using {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300}: a very slight improvement
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not as good
So far, the best combination of XGBoost parameters is n_estimators=400, max_depth=5, learning_rate=0.1. After one more range below, let's take the best tuned values and XGBoost's defaults (from its source code) and run both, comparing results side by side.
%time xgboost_tune_n_estimators_and_max_depth_and_lr(to.xs, to.ys.values.ravel(), n_estimators_range[8:9], max_depth_range, learning_rate_range, n_splits, sampling_methods[1], random_seed, scoring)
Best: 0.966590 using {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 450}
- vs our running best of 0.966493 using {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400}: slightly better
- vs our comparator score for this range, 0.964941 (0.000612) with {'n_estimators': 450}: better
- vs our individual-parameter tuning best of 0.966902 (0.000735) with {'n_estimators': 1950}: not quite as good
Ending here for now, as the run times were getting too long and we're already very close to the performance of the best single-parameter-tuned result.
# XGBoost defaults per its source code
# (NB: XGBoost's documented default learning_rate/eta is 0.3; 0.2 is what was used in this comparison)
n_estimators_original = 100
max_depth_original = 6
learning_rate_original = 0.2
# best values from the combined tuning above
n_estimators_recommended = 450
max_depth_recommended = 5
learning_rate_recommended = 0.1
# best value from single-parameter tuning
n_estimators_max = 1950
subsample = 1
enable_categorical = True
X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_valid, y_valid = to.valid.xs, to.valid.ys.values.ravel()
model_original = xgb.XGBClassifier(n_estimators=n_estimators_original, max_depth=max_depth_original, learning_rate=learning_rate_original, subsample=subsample,
                                   tree_method='gpu_hist', gpu_id=0, verbosity=2, enable_categorical=enable_categorical, sampling_method=sampling_methods[1])
%time xgb_model_original = model_original.fit(X_train, y_train)
xgb_preds_original = xgb_model_original.predict_proba(X_valid)
xgb_preds_original
model_recommended = xgb.XGBClassifier(n_estimators=n_estimators_recommended, max_depth=max_depth_recommended, learning_rate=learning_rate_recommended, subsample=subsample,
                                      tree_method='gpu_hist', gpu_id=0, verbosity=3, enable_categorical=enable_categorical, sampling_method=sampling_methods[1])
%time xgb_model_recommended = model_recommended.fit(X_train, y_train)
xgb_preds_recommended = xgb_model_recommended.predict_proba(X_valid)
xgb_preds_recommended
model_max_e = xgb.XGBClassifier(n_estimators=n_estimators_max, max_depth=max_depth_recommended, learning_rate=learning_rate_recommended, subsample=subsample,
                                tree_method='gpu_hist', gpu_id=0, verbosity=2, enable_categorical=enable_categorical, sampling_method=sampling_methods[1])
%time xgb_model_max_e = model_max_e.fit(X_train, y_train)
xgb_preds_max_e = xgb_model_max_e.predict_proba(X_valid)
xgb_preds_max_e
accuracy(tensor(xgb_preds_original), tensor(y_valid)), accuracy(tensor(xgb_preds_recommended), tensor(y_valid)), accuracy(tensor(xgb_preds_max_e), tensor(y_valid))
roc_auc_score(y_score=tensor(xgb_preds_original[:,1:2]), y_true=tensor(y_valid)), roc_auc_score(y_score=tensor(xgb_preds_recommended[:,1:2]), y_true=tensor(y_valid)), roc_auc_score(y_score=tensor(xgb_preds_max_e[:,1:2]), y_true=tensor(y_valid))
So as we can see, we get slightly better performance in both accuracy and `roc_auc_score` (the metric used in the Kaggle competition) when we use a lower `n_estimators` tuned together with the `max_depth` and `learning_rate` parameters, than when we use either the XGBoost defaults or the maximum value of `n_estimators` alone.
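Finally, an optional convenience (a sketch using the predictions computed above) to collect the three models' validation scores in one table:
import pandas as pd
summary = pd.DataFrame({
    'model': ['default', 'recommended', 'max n_estimators'],
    'accuracy': [float(accuracy(tensor(p), tensor(y_valid)))
                 for p in (xgb_preds_original, xgb_preds_recommended, xgb_preds_max_e)],
    'roc_auc': [roc_auc_score(y_true=y_valid, y_score=p[:, 1])
                for p in (xgb_preds_original, xgb_preds_recommended, xgb_preds_max_e)],
})
summary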