Random Forest and Data Exploration
This is a basic random forest model for the purposes of data exploration and establishing a baseline performance which can be referenced when more advanced models are used. At the end of this notebook, the most important columns are identified.
!pip install -Uqq ipywidgets #This is already included in fastai but ah well
import ipywidgets as widgets
from pathlib import Path
def on_file_upload(change):
    """Store the uploaded Kaggle API token at ``~/.kaggle/kaggle.json``.

    Intended as an observer callback for the ``btn_upload`` widget. The most
    recent upload (``btn_upload.data[-1]``) is written out unless a
    credentials file already exists, in which case nothing is changed.

    Args:
        change: Change notification supplied by the widget; not inspected.
    """
    cred_path = Path('~/.kaggle/kaggle.json').expanduser()
    if cred_path.exists():
        return
    cred_path.parent.mkdir(exist_ok=True)
    # Write the uploaded bytes verbatim. Slicing the repr of the bytes object
    # (str(data)[2:-1]) would turn newlines, quotes and non-ASCII bytes into
    # literal escape sequences and corrupt the JSON file.
    cred_path.write_bytes(bytes(btn_upload.data[-1]))
    # Owner read/write only: the file holds an API secret.
    cred_path.chmod(0o600)
# Single-file picker limited to .json, used to supply the Kaggle API token.
btn_upload = widgets.FileUpload(multiple=False, accept='.json')
# Run on_file_upload each time the widget's 'data' trait changes.
btn_upload.observe(on_file_upload, names='data')
# Render a short prompt followed by the upload button.
display(widgets.Label("Upload 'kaggle.json'"), btn_upload)
!pip install -Uqq fastai
!pip install -Uqq kaggle
import pandas as pd
import numpy as np
from fastai.tabular.all import *
from kaggle import api
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
# Download the competition bundle and unpack the top-level archive.
api.competition_download_cli("homesite-quote-conversion")
file_extract("homesite-quote-conversion.zip")

# Unpack every .zip now present in the working directory...
for archive in [name for name in os.listdir() if name.endswith(".zip")]:
    file_extract(archive)
# ...then list the directory again and delete all .zip files.
for archive in [name for name in os.listdir() if name.endswith(".zip")]:
    os.remove(archive)
# Load the train and test tables, parsing the quote date as a datetime.
X_full = pd.read_csv("train.csv", parse_dates=['Original_Quote_Date'])
X_test_full = pd.read_csv("test.csv", parse_dates=['Original_Quote_Date'])

# Discard training rows that have no target value (QuoteConversion_Flag).
X_full.dropna(subset=['QuoteConversion_Flag'], axis=0, inplace=True)

# Expand the quote date into Year / Month / Day columns on both tables.
# Author's note: this model actually performs better if dates are just
# dropped; they are kept to demonstrate how date features would be built.
for frame in (X_full, X_test_full):
    quote_date = frame["Original_Quote_Date"].dt
    frame["Year"] = quote_date.year
    frame["Month"] = quote_date.month
    frame["Day"] = quote_date.day

# The raw datetime column is no longer needed once its parts are extracted.
X_full = X_full.drop(columns="Original_Quote_Date")
X_test_full = X_test_full.drop(columns="Original_Quote_Date")
# Separate the target column from the rest of the data (split y from X).
y = X_full.QuoteConversion_Flag
X_full.drop(['QuoteConversion_Flag'], axis=1, inplace=True)

# Hold out 20% of the rows for validation; the seed keeps the split fixed.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=42)

# Select numerical columns. Every numeric dtype is accepted, not only
# int64/float64: the Year/Month/Day columns derived from the quote date can
# be int32 (pandas 2.x) and would otherwise be silently left out.
numerical_cols = X_train_full.select_dtypes(include='number').columns.tolist()

# Select categorical columns (must have cardinality < 10 so that one-hot
# encoding does not create an excessive number of columns).
categorical_cols = [
    col for col in X_train_full.columns
    if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 10
]

# Combine both groups and take matching copies of every data split.
selected_cols = numerical_cols + categorical_cols
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()
X_test = X_test_full[selected_cols].copy()
# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own preprocessing.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random forest with 100 trees and a fixed seed.
model = RandomForestClassifier(random_state=0, n_estimators=100)

# Chain preprocessing and the model so both are fitted and applied together.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])
clf.fit(X_train, y_train)
# Hard class labels for the validation set (the pipeline applies the same
# preprocessing before predicting).
preds = clf.predict(X_valid)
# Positive-class probabilities. ROC AUC measures how well the scores rank
# positives above negatives, so it must be fed continuous scores; using the
# 0/1 labels collapses the curve to a single threshold and understates it.
valid_scores = clf.predict_proba(X_valid)[:, 1]
preds_final = clf.predict_proba(X_test)[:, 1]

# Format and output to csv file.
# NOTE(review): probabilities are written instead of 0/1 labels on the
# assumption that the competition is scored on ROC AUC - confirm.
df_final = pd.DataFrame({
    "QuoteNumber": X_test.QuoteNumber,
    "QuoteConversion_Flag": preds_final
})
df_final.to_csv("submission.csv", index=False)

print("ROC AUC Score: " + str(roc_auc_score(y_valid, valid_scores)))
plot_roc_curve(clf, X_valid, y_valid)
plot_confusion_matrix(clf, X_valid, y_valid, values_format='d')
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_extraction.text import _VectorizerMixin
def get_feature_out(estimator, feature_in):
    """Return the feature names one fitted step produces from ``feature_in``.

    - Steps exposing ``get_feature_names``: vectorizers yield their own
      names prefixed with ``vec_``; any other step is asked for names
      derived from ``feature_in``.
    - Feature selectors: the subset of ``feature_in`` kept by the support
      mask.
    - Anything else: ``feature_in`` is returned unchanged.
    """
    if not hasattr(estimator, 'get_feature_names'):
        if isinstance(estimator, SelectorMixin):
            return np.array(feature_in)[estimator.get_support()]
        return feature_in

    if isinstance(estimator, _VectorizerMixin):
        # Vectorizers name their outputs themselves and ignore the inputs.
        return [f'vec_{f}' for f in estimator.get_feature_names()]
    return estimator.get_feature_names(feature_in)
def get_feature_names(ct):
    """Return the output feature names of a fitted ColumnTransformer.

    Iterates ``ct.transformers_`` in order and collects the names each entry
    contributes:

    - entries set to ``'drop'`` contribute nothing;
    - a Pipeline entry has its column names threaded through every step via
      ``get_feature_out``;
    - any other named entry is passed to ``get_feature_out`` directly;
    - a ``'passthrough'`` remainder contributes the input column names
      selected by its indices, which requires the transformer to have
      recorded its input column names; any other remainder is ignored.

    Args:
        ct: Fitted ColumnTransformer (anything exposing ``transformers_``).

    Returns:
        list: Feature names in transformer order.
    """
    output_features = []
    for name, estimator, features in ct.transformers_:
        # Dropped column groups emit no output columns, so they must not be
        # reported as features.
        if isinstance(estimator, str) and estimator == 'drop':
            continue

        if name == 'remainder':
            if isinstance(estimator, str) and estimator == 'passthrough':
                # Prefer the public attribute when present and fall back to
                # the private one that was used originally.
                names_in = getattr(ct, 'feature_names_in_', None)
                if names_in is None:
                    names_in = ct._feature_names_in
                output_features.extend(names_in[features])
            continue

        if isinstance(estimator, Pipeline):
            # Each step renames or filters the names produced by the last.
            features_out = features
            for step in estimator:
                features_out = get_feature_out(step, features_out)
        else:
            features_out = get_feature_out(estimator, features)
        output_features.extend(features_out)
    return output_features
#but a simple concat would probably also work...
#np.concatenate((np.array(numerical_cols),
# clf[0].transformers_[1][1][1].get_feature_names()))
# Pair every post-preprocessing column name with the forest's importance
# score for it, ordered from most to least important.
df_importance = (
    pd.DataFrame({
        'cols': get_feature_names(clf.named_steps['preprocessor']),
        'imp': clf.named_steps['model'].feature_importances_,
    })
    .sort_values(by='imp', ascending=False)
)
def plot_fi(fi):
    """Draw a horizontal bar chart of the 'imp' column against 'cols'.

    Returns whatever ``fi.plot`` returns.
    """
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)
# df_importance is sorted by descending importance, so the first 30 rows
# are the 30 highest-ranked features.
plot_fi(df_importance[:30])
Unimportant features may be removed from the model to make the data more manageable