This is a Colab notebook. It trains a basic Random Forest model without any hyperparameter tuning or feature engineering. Kaggle score: 0.953. This can serve as a baseline model.

import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
# plot_roc_curve and plot_confusion_matrix were deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use RocCurveDisplay / ConfusionMatrixDisplay instead
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt

# fastai / fastbook / dtreeviz imports are kept from the notebook setup but are not used in this baseline
from fastbook import *
from fastai.tabular.all import *
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG
import random as rd


pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

Download Data From Kaggle

!mkdir -p ~/.kaggle
!cp /content/gdrive/MyDrive/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
path = Path('/content/gdrive/MyDrive/Kaggle/' + 'data/homesite-quote')
path.mkdir(parents=True, exist_ok=True)
path
Path('/content/gdrive/MyDrive/Kaggle/data/homesite-quote')
!kaggle competitions download -c homesite-quote-conversion -p /content/gdrive/MyDrive/Kaggle/data/homesite-quote
Warning: Looks like you're using an outdated API Version, please consider updating (server 1.5.12 / client 1.5.4)
Downloading train.csv.zip to /content/gdrive/MyDrive/Kaggle/data/homesite-quote
 89% 33.0M/37.1M [00:00<00:00, 63.7MB/s]
100% 37.1M/37.1M [00:00<00:00, 83.8MB/s]
Downloading sample_submission.csv.zip to /content/gdrive/MyDrive/Kaggle/data/homesite-quote
  0% 0.00/258k [00:00<?, ?B/s]
100% 258k/258k [00:00<00:00, 36.1MB/s]
Downloading test.csv.zip to /content/gdrive/MyDrive/Kaggle/data/homesite-quote
 53% 13.0M/24.7M [00:00<00:00, 68.0MB/s]
100% 24.7M/24.7M [00:00<00:00, 82.5MB/s]
! unzip -q -n '{path}/train.csv.zip' -d '{path}'
! unzip -q -n '{path}/test.csv.zip' -d '{path}'
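As a quick sanity check (not part of the original run), the extracted files can be listed; this sketch assumes the zips place the CSVs directly in the data directory.

# Confirm that train.csv and test.csv were extracted into the data directory
sorted(f.name for f in path.glob('*.csv'))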

Import Data

df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
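A quick look at the shapes and the target balance helps frame the baseline. The sketch below is an optional addition, not part of the original notebook.

# Rough sanity check: row/column counts and the class balance of the target
print(df.shape, test_df.shape)
print(df['QuoteConversion_Flag'].value_counts(normalize=True))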

Data Prep

# Drop the quote date column; this baseline does no date feature engineering
def drop_cols(df):
    df.drop(['Original_Quote_Date'], axis=1, inplace=True)
    return df

train_df = df
train_df = drop_cols(train_df)
test_df = drop_cols(test_df)
# Columns with any missing values in the training set are dropped from both train and test
cols_to_delete = train_df.isna().sum()[train_df.isna().sum() > 0].index

def drop_cols_from_list(df, cols_to_delete):
    df.drop(cols_to_delete, axis=1, inplace=True)
    return df

train_df = drop_cols_from_list(train_df, cols_to_delete)
test_df = drop_cols_from_list(test_df, cols_to_delete)
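Dropping every column with missing values is a blunt baseline choice; a quick diagnostic like the optional sketch below shows how much data is being discarded before committing to it.

# How many columns were removed because they contained missing values in the training set
# (note: this only inspects the training set; the test set could still have NaNs elsewhere)
print(len(cols_to_delete), "columns dropped due to missing values:", list(cols_to_delete))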
# Drop non-numeric columns with 3 or more distinct values (high-cardinality categoricals)
cols_to_drop = []

for i in set(train_df.columns) - set(train_df._get_numeric_data().columns):
    if train_df.loc[:, i].nunique() >= 3:
        cols_to_drop.append(i)

train_df = drop_cols_from_list(train_df, cols_to_drop)
test_df = drop_cols_from_list(test_df, cols_to_drop)
# The remaining non-numeric (low-cardinality) columns are one-hot encoded
cls_to_encode = list(set(train_df.columns) - set(train_df._get_numeric_data().columns))

def ohe(df, cls_to_encode):
    df = pd.get_dummies(df, columns=cls_to_encode, drop_first=True)
    return df

train_df = ohe(train_df, cls_to_encode)
test_df = ohe(test_df, cls_to_encode)
# Drop dummy columns that appear only in the test set so train and test share the same features
test_df.drop(list(set(test_df.columns) - set(train_df.columns)), axis=1, inplace=True)
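Because train and test are encoded separately, the test set can also be missing dummy columns that exist in train; the line above only removes the extra ones. A more robust variant, sketched below as an optional addition, reindexes the test set to the training feature columns so both the column set and the column order match what the model will see.

# Optional safeguard (not in the original run): align test columns to the training features,
# filling any dummy column that never appears in the test set with 0
feature_cols = [c for c in train_df.columns if c != 'QuoteConversion_Flag']
test_df = test_df.reindex(columns=feature_cols, fill_value=0)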

Model

X = train_df.drop('QuoteConversion_Flag', axis=1)
y = train_df.QuoteConversion_Flag
# Stratified 80/20 split; the row indices are saved so the same split can be reused later
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
train_idx = pd.DataFrame(X_train.index)
valid_idx = pd.DataFrame(X_test.index)
train_idx.to_csv(path/'train_idx.csv', index=False)
valid_idx.to_csv(path/'valid_idx.csv', index=False)
# Random forest with default hyperparameters (100 trees)
rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
print("train accuracy score = ", accuracy_score(y_train,rfc.predict(X_train)))
print("test accuracy score = ", accuracy_score(y_test,rfc.predict(X_test)))
train accuracy score =  0.9999952061821076
test accuracy score =  0.9162623919004429
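Near-perfect training accuracy next to roughly 0.916 validation accuracy is the usual sign of an untuned random forest memorizing the training set. An optional next step (not in the original notebook) is to inspect the impurity-based feature importances before the next iteration.

# Optional: top features by impurity-based importance (a standard RandomForestClassifier attribute)
importances = pd.Series(rfc.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))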
roc_auc_score(y_train,rfc.predict(X_train)),roc_auc_score(y_test,rfc.predict(X_test))
(0.9999872171801099, 0.8078645543138884)
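The AUC above is computed on hard 0/1 predictions, which understates the model's ranking quality; the submission uses predicted probabilities, and the public Kaggle score of 0.953 is presumably based on those. The sketch below (an addition) recomputes AUC from predict_proba.

# AUC computed from predicted probabilities rather than hard labels
train_auc = roc_auc_score(y_train, rfc.predict_proba(X_train)[:, 1])
valid_auc = roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])
print(train_auc, valid_auc)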
plot_roc_curve(rfc, X_test, y_test)
plt.show()
plot_confusion_matrix(rfc, X_test, y_test,values_format='d')
plt.show()
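plot_roc_curve and plot_confusion_matrix were removed in scikit-learn 1.2. On newer environments the same plots can be produced with the Display API; this is an equivalent sketch, not part of the original run.

# Equivalent plots on scikit-learn >= 1.0 using the Display classes
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
RocCurveDisplay.from_estimator(rfc, X_test, y_test)
plt.show()
ConfusionMatrixDisplay.from_estimator(rfc, X_test, y_test, values_format='d')
plt.show()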
# The competition is scored on AUC, so the predicted probability of conversion (class 1) is submitted
output_submission = pd.DataFrame(zip(test_df.QuoteNumber, rfc.predict_proba(test_df)[:,1]), columns=['QuoteNumber','QuoteConversion_Flag'])
output_submission.to_csv(path/'output_submission.csv', index=False)
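The file can also be submitted directly from the notebook with the Kaggle CLI configured earlier; the submission message below is just an example.

# Submit the baseline predictions from within Colab (message text is arbitrary)
!kaggle competitions submit -c homesite-quote-conversion -f '{path}/output_submission.csv' -m 'baseline random forest'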