Pickled Model using ipywidgets
This notebook loads a previously trained model and uses it to predict quote success rate, letting the user change field values. The input controls are ipywidgets generated on the fly so that the most sensitive fields can be altered. Final app source code is available at [https://github.com/timcu/fast-tabulous-app/blob/main/fast-tabulous-with-db.ipynb](https://github.com/timcu/fast-tabulous-app/blob/main/fast-tabulous-with-db.ipynb). The final app can be used at [https://tabulous.pythonator.com](https://tabulous.pythonator.com).
- Setup - load trained model
- Create a sensitivity analysis tool
- Application: Step 1 - Ask user for quote number
- Application: Step 2 - Do sensitivity analysis
- Application: Step 3 - Try altering values of sensitive fields
import logging
import random
import ipywidgets as widgets
import pandas as pd
import numpy as np
from fastai.tabular.all import *
from IPython.display import display
from IPython.utils import io # using io.capture_output
from sklearn.metrics import roc_auc_score
# Setup: configure logging, load the trained learner and build the combined
# DataFrame `df` of independent variables (train + test) indexed by QuoteNumber.

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None # default='warn'
logger = logging.getLogger("load_pickled_model")
# INFO level: the logger.info() calls below are emitted, logger.debug() calls are not.
logging.basicConfig(level=logging.INFO)
# Directory holding both the pickled learner and the competition data.
path = Path('data/homesite-quote')
learn = load_learner(path/"learn_empty_dls_0708.pkl")
# Fetch the competition archive from Kaggle only when it is not already on disk.
# NOTE(review): indentation was lost in this copy of the notebook; the three
# file_extract() calls are assumed to belong to the download branch - confirm.
if not(path/"homesite-quote-conversion.zip").exists():
    # Imported lazily so the kaggle package is only needed when a download is required.
    from kaggle import api
    api.competition_download_cli('homesite-quote-conversion', path=path)
    file_extract(path/"homesite-quote-conversion.zip")
    file_extract(path/"train.csv.zip")
    file_extract(path/"test.csv.zip")
df_train = pd.read_csv(path/'train.csv', low_memory=False, parse_dates=['Original_Quote_Date'], index_col="QuoteNumber")
df_test = pd.read_csv(path/'test.csv', low_memory=False, parse_dates=['Original_Quote_Date'], index_col="QuoteNumber")
# Keep the dependent variable separately; it only exists for the training quotes.
sr_conv = df_train['QuoteConversion_Flag']
df_train.drop('QuoteConversion_Flag', inplace=True, axis=1)
# One frame with every quote (train followed by test) so any quote number can be looked up.
df = pd.concat([df_train, df_test])
# fastai helper applied to the date column.
# NOTE(review): presumably this must match the preprocessing used when the learner was trained - confirm.
df = add_datepart(df, 'Original_Quote_Date')
logger.debug(f"{df.shape} {df_train.shape} {df_test.shape} {sr_conv.shape}")
# Drop the references to the individual frames; the code shown here only uses df from now on.
df_train = None
df_test = None
Create a sensitivity analysis tool
A field is sensitive if changing the value of the field can change the outcome of the predicted quote success.
While the logging level is INFO, some log messages are emitted during a normal run. Setting the logging level to WARNING logs only when an unknown dtype is encountered. See the setup above to set the level.
def sensitivity_analysis(qn):
    """Using data from quote number qn do a sensitivity analysis on all independent variables.

    Each field of the quote is varied on its own (all other fields keep the
    quote's values) and the model is asked to predict again. A field is
    reported as sensitive when at least one alternative value changes the
    predicted QuoteConversion_Flag.

    Relies on the module-level names df, learn, sr_conv and logger.

    Keyword arguments
    qn: quote number (label in the index of df)
    Returns
    set of field names whose value, changed individually, alters the prediction
    """
    time_start = datetime.now()
    # Independent variables
    ind_original = df.loc[qn]
    prd = learn.predict(ind_original)
    # Predicted quote conversion flag
    qcf_original = prd[1].item()
    # Probability that quote conversion flag is as predicted
    prb_original = prd[2][qcf_original].item()
    logger.info(f"Sensitivity Analysis for Quote {qn}")
    # Check if we actually know the correct answer
    if qn in sr_conv.index:
        logger.info(f"Actual QuoteConversion_Flag {sr_conv[qn]}")

    def tf_sensitive(f, v_original, lst_v, p_original):
        """Predict quote success after changing field f from v_original to each value in lst_v.

        p_original is the probability of the original prediction (used for logging).
        Returns True if any alternative value changes the predicted flag.
        """
        # Create a DataFrame which has every row identical except for field in question
        # Field f iterates through every value provided
        ind_other = df.loc[qn:qn].copy().drop(f, axis=1)  # fields other than f
        ind_f = pd.DataFrame(data={f: lst_v}, index=[qn] * len(lst_v))
        # Merge these two DataFrames to create one with all rows identical except field f
        ind = pd.merge(ind_other, ind_f, right_index=True, left_index=True)
        # Copy lines from learn.predict() because we want to predict several rows at once (faster than one at a time)
        dl = learn.dls.test_dl(ind)
        dl.dataset.conts = dl.dataset.conts.astype(np.float32)
        # stop learn.get_preds() printing blank lines
        with io.capture_output():
            # using get_preds() rather than predict() because get_preds can do multiple rows at once
            _, preds, _, dec_preds = learn.get_preds(dl=dl, with_input=True, with_decoded=True)
        tf = False
        # Check if any predictions changed
        for i, dp in enumerate(dec_preds):
            qcf = dp.item()
            if qcf != qcf_original:
                prb = preds[i][qcf].item()
                # Use this function's own arguments rather than variables of the enclosing loop
                logger.info(f"Changing {f} from {v_original} to {lst_v[i]} changes prediction "
                            f"from {p_original:.2%} {qcf_original} to {prb:.2%} {qcf}")
                tf = True
        return tf

    set_sensitive = set()
    # Loop through all fields. Check different values of each field to see if result is sensitive to it.
    for field in df.columns:
        val_original = ind_original[field]
        dtype = df.dtypes[field]
        num_unique = df[field].nunique()
        # If number of unique values is under 30 then try every value (or for objects try every value)
        if num_unique < 30 or dtype == 'O':
            lst_unique = df[field].unique()
            if tf_sensitive(field, val_original, lst_unique, prb_original):
                logger.info(f"Possible values of {field} are {lst_unique}")
                set_sensitive.add(field)
        elif dtype == "int64" or dtype == "float64":
            # Too many values to try them all: sample 11 evenly spaced values from min to max
            vmin = df[field].min()
            vmax = df[field].max()
            if dtype == "int64":
                # floor division keeps the candidate values integral
                lst_val = [vmin + (vmax - vmin) * i // 10 for i in range(11)]
            else:
                lst_val = [vmin + (vmax - vmin) * i / 10 for i in range(11)]
            logger.debug(f"{field} {num_unique} {dtype!r} {vmin} {vmax} {lst_val}")
            if tf_sensitive(field, val_original, lst_val, prb_original):
                set_sensitive.add(field)
        else:
            logger.warning(f"Unknown type {field} {num_unique} {dtype!r}")
    # return the set of fields which had individual effects on the prediction
    logger.info(f"Time taken = {(datetime.now() - time_start).total_seconds()} seconds")
    return set_sensitive
def lst_ind_value(df, field, max_unique=30, num_steps=10):
    """Return the list of independent values to be tested for field.

    Keyword arguments
    df: dataframe of quote independent values
    field: name of the column in df
    max_unique: columns with fewer unique values than this are tested exhaustively (default 30)
    num_steps: number of equal steps between min and max for other numeric columns,
        giving num_steps + 1 candidate values (default 10)
    Returns
    array of every unique value (few unique values, or object dtype), or a list of
    evenly spaced values from min to max (int64 / float64), or an empty list for
    any other dtype (a warning is logged)
    Raises
    ValueError if num_steps is less than 1
    """
    if num_steps < 1:
        raise ValueError(f"num_steps must be at least 1, got {num_steps}")
    column = df[field]
    dtype = df.dtypes[field]
    num_unique = column.nunique()
    # If number of unique values is small then try every value (or for objects try every value)
    if num_unique < max_unique or dtype == 'O':
        return column.unique()
    if dtype == "int64" or dtype == "float64":
        vmin = column.min()
        vmax = column.max()
        if dtype == "int64":
            # floor division keeps the candidate values integral
            return [vmin + (vmax - vmin) * i // num_steps for i in range(num_steps + 1)]
        return [vmin + (vmax - vmin) * i / num_steps for i in range(num_steps + 1)]
    logger.warning(f"Unknown type {field} {num_unique} {dtype!r}")
    return []
def tf_equal_or_nan(a, b):
    """Return True if a equals b or if both a and b are NaN, otherwise False.

    NaN never compares equal to itself, so two NaN values need the explicit
    np.isnan check. Values np.isnan cannot handle (it raises TypeError) are
    simply treated as not NaN.
    """
    if not (a == b):
        try:
            both_nan = bool(np.isnan(a) and np.isnan(b))
        except TypeError:
            both_nan = False
        return both_nan
    return True
def df_for_field(qn, df, f, lst_v):
    """Build the rows needed to test alternative values of one field of a quote.

    Every returned row is a copy of quote qn in which only field f differs:
    row k carries lst_v[k] in column f. No prediction is made here.

    Keyword arguments
    qn: quote number
    df: dataframe of quote independent values
    f: field name
    lst_v: list of alternative values of independent value in field f
    Returns
    dataframe with one row per value in lst_v (all indexed by qn), holding the
    other fields of the quote unchanged, followed by column f and a column
    called fieldname that contains f in every row
    """
    n_variants = len(lst_v)
    # One row per alternative value, all sharing the quote number as index label
    variants = pd.DataFrame(
        data={f: lst_v, "fieldname": [f] * n_variants},
        index=[qn] * n_variants,
    )
    # Single-row frame with the fields other than f
    base_row = df.loc[qn:qn].copy().drop(f, axis=1)
    # Joining on the shared index label repeats the base row for every variant
    return base_row.merge(variants, left_index=True, right_index=True)
def sensitivity_analysis(qn):
    """Using data from quote number qn do a sensitivity analysis on all independent variables.

    For every field of df a set of alternative values is generated
    (lst_ind_value) and turned into rows that differ from the quote in that
    one field only (df_for_field). All rows are predicted in a single batch.

    Relies on the module-level names df, learn, sr_conv and logger.

    Keyword arguments
    qn: quote number (label in the index of df)
    Returns
    tuple (df_results, df_sensitivity)
    df_results: columns fieldname and prob_success (column 1 of the predictions),
        sorted by prob_success descending; its index labels are row labels of df_sensitivity
    df_sensitivity: the rows that were predicted, one per (field, alternative value)
    Raises
    KeyError if qn is not a quote number in df
    """
    time_start = datetime.now()
    # Fail early for an unknown quote. The label slice used in df_for_field
    # would not necessarily raise for a label that is missing.
    if qn not in df.index:
        raise KeyError(qn)
    logger.info(f"Sensitivity Analysis for Quote {qn}")
    # Check if we actually know the correct answer
    if qn in sr_conv.index:
        logger.info(f"Actual QuoteConversion_Flag {sr_conv[qn]}")
    # One frame per field, each holding the quote with that field set to every candidate value.
    # No baseline prediction of the unmodified quote is made: its result is not used here.
    lst_df_for_field = [
        df_for_field(qn, df, field, lst_ind_value(df, field))
        for field in df.columns
    ]
    df_sensitivity = pd.concat(lst_df_for_field, ignore_index=True)
    # fieldname is bookkeeping only and must not be fed to the model
    sr_fieldname = df_sensitivity['fieldname']
    df_sensitivity.drop('fieldname', inplace=True, axis=1)
    dl = learn.dls.test_dl(df_sensitivity)
    dl.dataset.conts = dl.dataset.conts.astype(np.float32)
    # stop learn.get_preds() printing blank lines
    with io.capture_output():
        # using get_preds() rather than predict() because get_preds can do multiple rows at once
        _, preds, _, _ = learn.get_preds(dl=dl, with_input=True, with_decoded=True)
    logger.info(f"Time taken = {(datetime.now() - time_start).total_seconds()} seconds")
    df_results = pd.DataFrame({'fieldname': sr_fieldname, 'prob_success': preds[:, 1]})
    df_results.sort_values(by='prob_success', ascending=False, inplace=True)
    return df_results, df_sensitivity
# Application step 1: choose a quote number.
# The selectable range is bounded by the quote numbers present in sr_conv's index.
qn_min = sr_conv.index.min()
qn_max = sr_conv.index.max()
# Start with a randomly chosen quote number inside that range.
qn = random.randint(qn_min, qn_max)
# Label whose text handle_quote_number_change replaces with the prediction summary.
wdg_quote_success = widgets.Label(value="")
def handle_quote_number_change(change):
    """Observer callback: predict the newly selected quote and show the result.

    Stores the new quote number in the module-level qn, predicts that quote with
    learn and writes actual outcome (or "unknown" when the quote is not in
    sr_conv), predicted probability and predicted flag to wdg_quote_success.

    Keyword arguments
    change: change notification whose attribute new is the selected quote number
    """
    global qn
    qn = change.new
    # Keep whatever learn.predict() prints out of the notebook output
    with io.capture_output():
        prediction = learn.predict(df.loc[qn])
    # Predicted quote conversion flag and the probability of that flag
    predicted_flag = prediction[1].item()
    probability = prediction[2][predicted_flag].item()
    if qn in sr_conv:
        actual = sr_conv[qn]
    else:
        actual = "unknown"
    wdg_quote_success.value = f"Quote {change.new} actual {actual} predicted {probability:.2%} {predicted_flag}"
# Two views of the same quote number: a bounded text box and a slider.
style = {'description_width': 'initial', 'width': '500px'}
wdg_quote_number_text = widgets.BoundedIntText(description="Quote number", min=qn_min, max=qn_max, value=qn, style=style)
wdg_quote_number_slider = widgets.IntSlider(description="Quote number", min=qn_min, max=qn_max, value=qn, style=style, layout={'width': '600px'})
# Link the 'value' of both widgets so that editing one updates the other.
mylink = widgets.jslink((wdg_quote_number_text, 'value'), (wdg_quote_number_slider, 'value'))
# Only the slider is observed; the text box reaches the handler through the link above.
wdg_quote_number_slider.observe(handle_quote_number_change, names='value')
display(wdg_quote_number_text)
display(wdg_quote_number_slider)
display(wdg_quote_success)
# Application step 2: sensitivity analysis for the quote currently selected on the slider.
# Anything emitted while it runs goes to the Output widget, which is only shown
# if the display(out) line below is re-enabled.
out = widgets.Output(layout={'border': '1px solid green'})
with out:
    df_results, df_sensitivity = sensitivity_analysis(wdg_quote_number_slider.value)
# display(out)
# Notebook cell output: the 20 rows with the highest prob_success.
df_results.head(20)
# Heading that handle_input_change overwrites with the prediction for the altered quote.
wdg_status = widgets.HTML(value=f"<h2>Quote {qn}</h2>")
def handle_input_change(change):
    """Observer callback: predict the selected quote with the values chosen in the input widgets.

    Takes the quote currently selected on wdg_quote_number_slider, overrides each
    field that has a widget in lst_input with that widget's value (the string
    "nan" stands for np.nan), predicts the altered quote and writes the result
    to wdg_status.

    Keyword arguments
    change: change notification from the widget that triggered the call (not used)
    """
    quote_number = wdg_quote_number_slider.value
    row = df.loc[quote_number].copy()
    for widget in lst_input:
        # Widgets carry "nan" in place of np.nan; turn it back into a real NaN
        value = np.nan if widget.value == "nan" else widget.value
        print(quote_number, widget.description, value)
        # The widget description is the field name
        row[widget.description] = value
    # Keep whatever learn.predict() prints out of the notebook output
    with io.capture_output():
        prediction = learn.predict(row)
    # NOTE(review): indentation was lost in this copy; this print is assumed to sit
    # outside the capture block (inside it would never be shown) - confirm.
    print(prediction[1], prediction[2])
    predicted_flag = prediction[1].item()
    probability = prediction[2][predicted_flag].item()
    if quote_number in sr_conv:
        actual = sr_conv[quote_number]
    else:
        actual = "unknown"
    wdg_status.value = f"<h2>Quote {quote_number} actual {actual} predicted {probability:.2%} {predicted_flag}</h2>"
# Application step 3: show the status heading above the input widgets.
display(wdg_status)
# Re-read the quote number from the slider in case it changed since qn was last assigned.
qn = wdg_quote_number_slider.value
# Style shared by the input widgets created below (replaces the earlier style dict).
style = {'description_width': 'initial'}
def nan_if_nan(n):
    """Return the str "nan" when n is NaN, otherwise return n unchanged.

    Can't include np.nan in dropdowns as np.nan != np.nan, so a str stands in
    for it. Values np.isnan cannot handle (it raises TypeError) are returned as is.
    """
    try:
        is_missing = np.isnan(n)
    except TypeError:
        is_missing = False
    return "nan" if is_missing else n
# Pick the fields to expose: walk df_results from the highest prob_success down
# (sensitivity_analysis returns it sorted that way) until 10 distinct fields are
# found, remembering for each field the values that produced those rows.
i = 0
dct_fields = defaultdict(list)
# i indexes rows of df_results, so the scan is bounded by its length
while len(dct_fields) < 10 and i < len(df_results):
    f = df_results.iloc[i, 0] # fieldname column
    # index labels of df_results identify the matching row of df_sensitivity
    idx = df_results.index[i]
    ind_val = df_sensitivity.loc[idx, f]
    dct_fields[f].append(ind_val)
    i += 1

# One input widget per selected field; handle_input_change reads them from lst_input.
lst_input = []
for priority, (f, lst_recommend) in enumerate(dct_fields.items(), start=1):
    num_unique = df[f].nunique()
    # (label, value) pairs ordered by label only, so values of different types
    # are never compared with each other
    lst_unique = sorted(
        ((str(opt), opt) for opt in (nan_if_nan(u) for u in df[f].unique())),
        key=lambda pair: pair[0],
    )
    v = nan_if_nan(df.loc[qn,f])
    tip = f"Priority {priority}. Initially {v}. Recommend {lst_recommend}"
    lbl = widgets.HTML(value=f"{tip}")
    # Radio buttons for a handful of options, a dropdown otherwise
    if num_unique < 5 and len(lst_unique) < 4:
        widget_cls = widgets.RadioButtons
    else:
        widget_cls = widgets.Dropdown
    wdg = widget_cls(options=lst_unique,
                     description=f,
                     description_tooltip=tip,
                     style=style,
                     value=v)
    wdg.observe(handle_input_change, names='value')
    display(widgets.HBox(children=[wdg, lbl]))
    lst_input.append(wdg)