Permutation Importance and Ensemble Experiments in Fastai Tabular for the Homesite Competition
Here I borrow generously from Zach's notebook
Introduction
Learning from WalkWithFastai's lesson on permutation importance and ensemble techniques, we apply some of those techniques to the Homesite competition data.
Notes:
- Changed the categorize functions from the last notebook to exclude any columns in y_names from being evaluated, since the dependent variable shouldn't be fed to the model as an input feature
!pip install -Uqq fastai
!pip install kaggle
from fastai.tabular.all import *
from kaggle import api
Path.cwd()
!touch .gitignore
!echo "_data" > .gitignore
!mkdir _data
os.chdir('_data')
Path.cwd()
path = Path.cwd()/"homesite_competition_data"
path.mkdir(exist_ok=True)
Path.BASE_PATH = path
api.competition_download_cli('homesite-quote-conversion', path=path)
file_extract(path/"homesite-quote-conversion.zip")
file_extract(path/"train.csv.zip")
file_extract(path/"test.csv.zip")
path.ls()
random_seed = 42
bs = 4096
val_bs = 512
test_size = 0.3
epochs = 3
lr = 1e-2
wd=0.002
layers = [10000,500]
dropout = [0.001, 0.01]
y_block=CategoryBlock()
emb_dropout=0.02
set_seed(random_seed)
df_train = pd.read_csv(path/"train.csv", low_memory=False)
df_train.head(2)
df_train.shape
df_test = pd.read_csv(path/"test.csv", low_memory=False)
df_test.head(2)
df_test.shape
y_column = df_train.columns.difference(df_test.columns)
y_column
df_train.QuoteConversion_Flag = df_train.QuoteConversion_Flag.astype(dtype='boolean')
train_data_balance = pd.DataFrame(df_train["QuoteConversion_Flag"]).groupby("QuoteConversion_Flag")
train_data_balance["QuoteConversion_Flag"].describe()
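As a quicker alternative check (not in the original notebook), value_counts with normalize=True reads off the class fractions directly:

# Fraction of quotes in each class of QuoteConversion_Flag
df_train["QuoteConversion_Flag"].value_counts(normalize=True)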
Adding Tim's bits of insight
df_train = df_train.set_index('QuoteNumber')
df_test = df_test.set_index('QuoteNumber')
df_train['Original_Quote_Date'] = pd.to_datetime(df_train['Original_Quote_Date'])
df_test['Original_Quote_Date'] = pd.to_datetime(df_test['Original_Quote_Date'])
df_train = add_datepart(df_train, 'Original_Quote_Date')
df_test = add_datepart(df_test, 'Original_Quote_Date')
y_names = [y_column[0]]
cont_names, cat_names = cont_cat_split(df_train, dep_var=y_names)
len(cont_names), len(cat_names)
triage = L()
def reassign_to_categorical(field, df, y_names, continuous, categorical, triage):
    # Only convert complete columns that aren't the dependent variable
    if (df[field].isna().sum() == 0) and (field not in y_names):
        field_categories = df[field].unique()
        df[field] = df[field].astype('category')
        # cat.set_categories(..., inplace=True) is deprecated; reassign instead
        df[field] = df[field].cat.set_categories(field_categories)
        if field in continuous: continuous.remove(field)
        if field not in categorical: categorical.append(field)
    else:
        # Columns with missing values (or the y column) get set aside for triage
        if field in continuous: continuous.remove(field)
        if field in categorical: categorical.remove(field)
        triage.append(field)
    return df, continuous, categorical, triage
def categorize(df, y_names, cont_names, cat_names, triage, category_threshold):
    # Any low-cardinality column that isn't already categorical is a candidate
    for field in df.columns:
        if (len(df[field].unique()) <= category_threshold) and not isinstance(df[field].dtype, pd.CategoricalDtype):
            reassign_to_categorical(field, df, y_names, cont_names, cat_names, triage)
    return df, cont_names, cat_names, triage
df_train, cont_names, cat_names, triage = categorize(df_train, y_names, cont_names, cat_names, triage, 100)
"QuoteConversion_Flag" in cont_names, "QuoteConversion_Flag" in cat_names #Make sure we've gotten our y-column excluded
procs = [Categorify, FillMissing, Normalize]
splits = TrainTestSplitter(test_size=test_size, stratify=df_train[y_names])(df_train)
to = TabularPandas(df=df_train, procs=procs, cat_names=cat_names,
cont_names=cont_names, y_names=y_names,splits=splits,
y_block=y_block)
# layers and dropout are model settings, not dataloader settings; they move to tabular_learner below
dls = to.dataloaders(bs=bs, val_bs=val_bs)
dls.valid.show_batch()
config = tabular_config(ps=dropout, embed_p=emb_dropout)
learn = tabular_learner(dls, layers=layers, config=config, metrics=accuracy)
learn.lr_find(suggest_funcs=(valley, slide, minimum, steep))
learn.fit_one_cycle(epochs,lr, wd=wd)
preds, targs = learn.get_preds()
accuracy(preds,targs)
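Since the competition itself is scored on ROC AUC rather than accuracy, it can be worth checking that as well. A minimal sketch with sklearn, assuming column 1 of preds holds the probability of conversion:

from sklearn.metrics import roc_auc_score

# AUC on the validation set; column 1 is P(QuoteConversion_Flag == 1)
roc_auc_score(targs.flatten().numpy(), preds[:, 1].numpy())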
class PermutationImportance():
    "Calculate and plot the permutation importance"
    def __init__(self, learn:Learner, df=None, bs=None):
        "Initialize with a learner, an optional dataframe, and an optional batch size"
        self.learn = learn
        self.df = df
        bs = bs if bs is not None else learn.dls.bs
        self.dl = learn.dls.test_dl(self.df, bs=bs) if self.df is not None else learn.dls[1]
        self.x_names = learn.dls.x_names.filter(lambda x: '_na' not in x)
        self.na = learn.dls.x_names.filter(lambda x: '_na' in x)
        self.y = learn.dls.y_names
        self.results = self.calc_feat_importance()
        self.plot_importance(self.ord_dic_to_df(self.results))

    def measure_col(self, name:str):
        "Measures change in the metric after a column shuffle"
        col = [name]
        # Shuffle the matching _na indicator column together with its source column
        if f'{name}_na' in self.na: col.append(f'{name}_na')
        orig = self.dl.items[col].values
        perm = np.random.permutation(len(orig))
        self.dl.items[col] = self.dl.items[col].values[perm]
        metric = self.learn.validate(dl=self.dl)[1]
        self.dl.items[col] = orig  # restore the unshuffled values
        return metric

    def calc_feat_importance(self):
        "Calculates permutation importance by shuffling each column in turn"
        print('Getting base error')
        base_error = self.learn.validate(dl=self.dl)[1]
        self.importance = {}
        pbar = progress_bar(self.x_names)
        print('Calculating Permutation Importance')
        for col in pbar:
            self.importance[col] = self.measure_col(col)
        for key, value in self.importance.items():
            # Relative drop in the metric after shuffling; this scaling can be adjusted
            self.importance[key] = (base_error - value) / base_error
        return OrderedDict(sorted(self.importance.items(), key=lambda kv: kv[1], reverse=True))

    def ord_dic_to_df(self, d:OrderedDict):
        return pd.DataFrame([[k, v] for k, v in d.items()], columns=['feature', 'importance'])

    def plot_importance(self, df:pd.DataFrame, limit=20, asc=False, **kwargs):
        "Plot importance with an optional limit to how many variables are shown"
        df_copy = df.copy()
        df_copy['feature'] = df_copy['feature'].str.slice(0, 25)
        df_copy = df_copy.sort_values(by='importance', ascending=asc)[:limit].sort_values(by='importance', ascending=not asc)
        ax = df_copy.plot.barh(x='feature', y='importance', **kwargs)
        for p in ax.patches:
            ax.annotate(f'{p.get_width():.4f}', (p.get_width() * 1.005, p.get_y() * 1.005))
imp = PermutationImportance(learn)
From this, the most important fields are PropertyField37, PersonalField2, PersonalField1, and SalesField5.
import xgboost as xgb
n_estimators = 100
max_depth = 8
learning_rate = 0.1
subsample = 0.5
X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_valid, y_valid = to.valid.xs, to.valid.ys.values.ravel()
model = xgb.XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, subsample=subsample)
xgb_model = model.fit(X_train, y_train)
xgb_preds = xgb_model.predict_proba(X_valid)
xgb_preds
accuracy(tensor(xgb_preds), tensor(y_valid))
from xgboost import plot_importance
plot_importance(xgb_model, height=1, max_num_features=20)
From this, the most important fields were SalesField1A, PersonalField9, Original_Quote_Elapsed, PersonalField10A, PersonalField10B, and PropertyField37.
avgs = (preds + xgb_preds) / 2
avgs
argmax = avgs.argmax(dim=1)
argmax
y_valid
accuracy(tensor(preds), tensor(y_valid))
accuracy(tensor(xgb_preds), tensor(y_valid))
accuracy(tensor(avgs), tensor(y_valid))
So we get slightly better performance by ensembling these two models.
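Rather than a fixed 50/50 blend, we could also scan the blend weight on the validation set. A small sketch of that idea (not in the original notebook; note that tuning the weight on the same validation data risks overfitting to it):

# Scan blend weights between the neural net and XGBoost predictions
best_w, best_acc = 0.5, 0.
for i in range(21):
    w = i / 20
    blended = w * preds + (1 - w) * tensor(xgb_preds)
    acc = accuracy(blended, tensor(y_valid)).item()
    if acc > best_acc: best_w, best_acc = w, acc
best_w, best_acc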
from sklearn.ensemble import RandomForestClassifier
tree = RandomForestClassifier(n_estimators=100)
tree.fit(X_train, y_train)
!pip install rfpimp
from rfpimp import *
impTree = importances(tree, X_valid, to.valid.ys)
plot_importances(impTree)
So here the most important fields are PropertyField37, Field7, PersonalField1, SalesField5, PersonalField9, and PersonalField2.
forest_preds = tree.predict_proba(X_valid)
forest_preds
accuracy(tensor(forest_preds), tensor(y_valid))
new_avgs = (preds + xgb_preds + forest_preds) / 3
accuracy(tensor(new_avgs), tensor(y_valid))
So it gets slightly worse when we add Random Forest to the ensemble.
The next step will be to apply the models to the Kaggle test set and try some submissions to see how they score.
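As a sketch of what a submission for the fastai model might look like (assuming the standard QuoteNumber / QuoteConversion_Flag submission format, and remembering that df_test was already given the same add_datepart treatment as the training data):

# Build a test dataloader that re-applies the training-time procs to df_test
test_dl = learn.dls.test_dl(df_test)
test_preds, _ = learn.get_preds(dl=test_dl)
# Column 1 holds the predicted probability of conversion
sub = pd.DataFrame({'QuoteNumber': df_test.index,
                    'QuoteConversion_Flag': test_preds[:, 1].numpy()})
sub.to_csv(path/'submission.csv', index=False)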