Identify problem type
This notebook runs in Google Colab. It downloads Kaggle's homesite-quote-conversion data, cleans up dates and integers stored as strings, and compares the quote dates in the training and test data to decide whether this should be treated as a time series problem.
Install packages recommended in fastbook Ch09
!pip install -Uqq fastbook kaggle waterfallcharts treeinterpreter dtreeviz
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.vision.widgets import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8
Upload your kaggle.json API key
btn_upload = widgets.FileUpload(description="kaggle.json")
btn_upload
Save credentials
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.parent.exists():
    cred_path.parent.mkdir()
if len(btn_upload.data) > 0:
    with open(cred_path, mode="wb") as cred_file:
        cred_file.write(btn_upload.data[-1])
    cred_path.chmod(0o600)
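As an optional sanity check (not part of the fastbook recipe), confirm the credentials file was written and has the owner-only permissions the kaggle package expects:
# Optional check: the file should exist with mode 0o600
if cred_path.exists():
    print("saved, mode:", oct(cred_path.stat().st_mode & 0o777))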
from kaggle import api
Note that '!pip install kaggle' does not update the kaggle CLI in Google Colab: the CLI stays at v1.5.4 while the Python kaggle.api package is v1.5.12
!kaggle --version
Python's kaggle.api is using a more recent version
api.__version__
Get the data from Kaggle, extract it, and store it in _data
path_hqc = (Path.cwd()/"_data")
path_hqc.mkdir(exist_ok=True)
Path.BASE_PATH = path_hqc
api.competition_download_cli('homesite-quote-conversion', path=path_hqc)
file_extract(path_hqc/"homesite-quote-conversion.zip")
file_extract(path_hqc/"train.csv.zip")
file_extract(path_hqc/"test.csv.zip")
Check what the data looks like
df = pd.read_csv(path_hqc/"train.csv", low_memory=False)
df.head()
Check how much data we have and whether QuoteNumber is unique
df.shape, len(df['QuoteNumber'].unique())
Conclusion: QuoteNumber is unique
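An equivalent check is pandas' built-in is_unique property on the column:
# True means every QuoteNumber appears exactly once
df['QuoteNumber'].is_unique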
We don't want to use QuoteNumber as a feature, but we could use it as the index
df = df.set_index('QuoteNumber')
Examine data types in train.csv
df.info()
Find the 28 fields which do not have numeric datatypes
from collections import defaultdict
dct_fields_by_dtype = defaultdict(list)
for i, dt in enumerate(df.dtypes):
    dct_fields_by_dtype[dt].append(df.dtypes.index[i])
print("dtypes in train.csv:", dct_fields_by_dtype.keys())
print("fields for object dtype:", dct_fields_by_dtype[np.dtype('O')])
print("number of fields of object dtype:", len(dct_fields_by_dtype[np.dtype('O')]))
Original_Quote_Date can be converted to datetime
df['Original_Quote_Date'] = pd.to_datetime(df['Original_Quote_Date'])
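If we later want calendar features from the quote date, fastai's add_datepart (already imported via fastai.tabular.all) expands a datetime column into parts such as year, month and day of week. A sketch on a copy of the frame, since add_datepart drops the source column:
# Sketch only: expand Original_Quote_Date into Original_Quote_Year, ..._Month, etc.
df_dates = add_datepart(df.copy(), 'Original_Quote_Date')
[c for c in df_dates.columns if c.startswith('Original_Quote_')]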
Recalculate breakdown now that we have changed dtype of Original_Quote_Date
dct_fields_by_dtype = defaultdict(list)
for i, dt in enumerate(df.dtypes):
    dct_fields_by_dtype[dt].append(df.dtypes.index[i])
df.info()
Compare Original_Quote_Date in train.csv and test.csv
df_test = pd.read_csv(path_hqc/"test.csv", low_memory=False)
df_test["Original_Quote_Date"] = pd.to_datetime(df_test["Original_Quote_Date"])
print("train.csv", df['Original_Quote_Date'].min(), df['Original_Quote_Date'].max(), df.shape)
print("test.csv ", df_test['Original_Quote_Date'].min(), df_test['Original_Quote_Date'].max(), df_test.shape)
Conclusion: the date ranges overlap (in fact they are identical), so we don't need to treat this as a time series problem
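A quick way to quantify the overlap is to count how many test quotes fall inside the training date range:
# Fraction of test rows whose quote date lies within the train date range
in_range = df_test['Original_Quote_Date'].between(
    df['Original_Quote_Date'].min(), df['Original_Quote_Date'].max())
in_range.mean()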
Check the non-numeric values in other object fields
for col in dct_fields_by_dtype[np.dtype('O')]:
    print(f"{col:20s} {df[col].unique()}")
Field10 looks like integers stored as strings, so convert it to ints
df['Field10'] = df['Field10'].str.replace(",", "").astype(int)
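The same cleanup will presumably be needed on the test set (assuming Field10 holds the same comma-formatted strings there), so train and test stay consistent:
# Apply the identical transformation to test.csv
df_test['Field10'] = df_test['Field10'].str.replace(",", "").astype(int)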
Recalculate breakdown now that we have changed dtype of Field10
dct_fields_by_dtype = defaultdict(list)
for i, dt in enumerate(df.dtypes):
    dct_fields_by_dtype[dt].append(df.dtypes.index[i])
df.info()