import logging
import random
from collections import defaultdict
from datetime import datetime

import ipywidgets as widgets
import pandas as pd
import numpy as np

from fastai.tabular.all import *
from IPython.display import display
from IPython.utils import io  # using io.capture_output
from sklearn.metrics import roc_auc_score

pd.options.mode.chained_assignment = None  # default='warn'
logger = logging.getLogger("load_pickled_model")
logging.basicConfig(level=logging.INFO)

Setup - load trained model

On the GPU training instance, run the following export command to save the trained TabularLearner; the exported file is then loaded below.

learn.export(fname="learn_empty_dls_0708.pkl")
path = Path('data/homesite-quote')
learn = load_learner(path/"learn_empty_dls_0708.pkl")

if not (path/"homesite-quote-conversion.zip").exists():
    from kaggle import api
    api.competition_download_cli('homesite-quote-conversion', path=path)
    file_extract(path/"homesite-quote-conversion.zip")
    file_extract(path/"train.csv.zip")
    file_extract(path/"test.csv.zip")

df_train = pd.read_csv(path/'train.csv', low_memory=False, parse_dates=['Original_Quote_Date'], index_col="QuoteNumber")
df_test = pd.read_csv(path/'test.csv', low_memory=False, parse_dates=['Original_Quote_Date'], index_col="QuoteNumber")
sr_conv = df_train['QuoteConversion_Flag']
df_train.drop('QuoteConversion_Flag', inplace=True, axis=1)
df = pd.concat([df_train, df_test])
df = add_datepart(df, 'Original_Quote_Date')
logger.debug(f"{df.shape} {df_train.shape} {df_test.shape} {sr_conv.shape}")
# Free the per-split frames; everything needed is now in df and sr_conv
df_train = None
df_test = None
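
add_datepart drops the raw Original_Quote_Date column and replaces it with engineered parts (year, month, day of week, elapsed seconds, and so on). A quick way to inspect what was added:

print([c for c in df.columns if c.startswith('Original_Quote_')])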

Create a sensitivity analysis tool

A field is sensitive if changing its value can change the predicted outcome of the quote (conversion or not).

While the logging level is INFO, some messages are logged during a normal run. Setting the level to WARNING logs only when an unknown dtype is encountered; see the setup above for where the level is set.
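
For example, to silence the per-field INFO messages and keep only the unknown-dtype warnings:

logger.setLevel(logging.WARNING)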

def sensitivity_analysis(qn):
    """Using data from quote number qn do a sensitivity analysis on all independent variables"""
    time_start = datetime.now()
    # Independent variables
    ind_original = df.loc[qn]
    prd = learn.predict(ind_original)
    # Predicted quote conversion flag
    qcf_original = prd[1].item()
    # Probability that quote conversion flag is as predicted
    prb_original = prd[2][qcf_original].item()
    logger.info(f"Sensitivity Analysis for Quote {qn}")
    # Check if we actually know the correct answer
    if qn in sr_conv.index:
        logger.info(f"Actual QuoteConversion_Flag {sr_conv[qn]}")

    def tf_sensitive(f, v_original, lst_v, p_original):
        """Predict quote success after changing field f from v_original to each value in lst_v.
        If any prediction changes, the quote is sensitive to this field and True is returned."""
        # Create a DataFrame which has every row identical except for field in question
        # Field f iterates through every value provided
        ind_other = df.loc[qn:qn].copy().drop(f, axis=1)  # fields other than f
        ind_f = pd.DataFrame(data={f: lst_v}, index=[qn] * len(lst_v))
        # Merge these two DataFrames to create one with all rows identical except field f
        ind = pd.merge(ind_other, ind_f, right_index=True, left_index=True)
        # Copy lines from learn.predict() because we want to predict several rows at once (faster than one at a time)
        dl = learn.dls.test_dl(ind)
        dl.dataset.conts = dl.dataset.conts.astype(np.float32)
        # stop learn.get_preds() printing blank lines
        with io.capture_output() as captured:
            # using get_preds() rather than predict() because get_preds can do multiple rows at once
            inp,preds,_,dec_preds = learn.get_preds(dl=dl, with_input=True, with_decoded=True)
        tf = False
        # Check if any predictions changed
        for i, dp in enumerate(dec_preds):
            qcf = dp.item()
            if qcf != qcf_original:
                prb = preds[i][qcf].item()
                logger.info(f"Changing {f} from {val_original} to {lst_v[i]} changes prediction "
                            f"from {prb_original:.2%} {qcf_original} to {prb:.2%} {qcf}")
                tf = True
        return tf

    set_sensitive = set()
    # Loop through all fields. Check different values of each field to see if result is sensitive to it.
    for field in df.columns:
        val_original = ind_original[field]
        tf_important = False
        num_unique = df[field].nunique()
        # If number of unique values is under 30 then try every value (or for objects try every value)
        if num_unique < 30 or df.dtypes[field] == 'O':
            lst_unique = df[field].unique()
            if tf_sensitive(field, val_original, lst_unique, prb_original):
                logger.info(f"Possible values of {field} are {lst_unique}")
                set_sensitive.add(field)
        else:
            if df.dtypes[field] == "int64":
                vmin = df[field].min()
                vmax = df[field].max()
                lst_val = [vmin + (vmax - vmin) * i // 10 for i in range(11)]
                logger.debug(f"{field} {num_unique} {df.dtypes[field]!r} {vmin} {vmax} {lst_val}")
                if tf_sensitive(field, val_original, lst_val, prb_original):
                    tf_important = True
            elif df.dtypes[field] == "float64":
                vmin = df[field].min()
                vmax = df[field].max()
                lst_val = [vmin + (vmax - vmin) * i / 10 for i in range(11)]
                logger.debug(f"{field} {num_unique} {df.dtypes[field]!r} {vmin} {vmax} {lst_val}")
                if tf_sensitive(field, val_original, lst_val, prb_original):
                    tf_important = True
            else:
                logger.warning(f"Unknown type {field} {num_unique} {df.dtypes[field]!r}")
            if tf_important:
                set_sensitive.add(field)
    logger.info(f"Time taken = {(datetime.now() - time_start).total_seconds()} seconds")
    # Return the set of fields which individually affected the prediction
    return set_sensitive
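
A quick smoke test of this first version, using the example quote from the application below:

set_sensitive = sensitivity_analysis(325710)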

The version above runs one batched prediction per field. The refactored version below splits the candidate-value logic into helpers so that every candidate row for every field can be scored in a single get_preds() call.

def lst_ind_value(df, field):
    """Return the list of independent values to be tested for field"""
    num_unique = df[field].nunique()
    # If number of unique values is under 30 then try every value (or for objects try every value)
    if num_unique < 30 or df.dtypes[field] == 'O':
        return df[field].unique()
    else:
        if df.dtypes[field] == "int64":
            vmin = df[field].min()
            vmax = df[field].max()
            return [vmin + (vmax - vmin) * i // 10 for i in range(11)]
        elif df.dtypes[field] == "float64":
            vmin = df[field].min()
            vmax = df[field].max()
            return [vmin + (vmax - vmin) * i / 10 for i in range(11)]
        else:
            logger.warning(f"Unknown type {field} {num_unique} {df.dtypes[field]!r}")
            return []
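
A toy illustration of the two paths (df_toy is made up for this example): a low-cardinality field returns every observed value, while a high-cardinality int64 field is binned into 11 evenly spaced steps.

df_toy = pd.DataFrame({'cat': list('xy') * 30, 'num': range(60)})
print(lst_ind_value(df_toy, 'cat'))  # ['x' 'y'] - every observed value
print(lst_ind_value(df_toy, 'num'))  # [0, 5, 11, ..., 53, 59] - 11 evenly spaced values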

def tf_equal_or_nan(a, b):
    """Return True if a == b, treating two NaNs as equal (note np.nan != np.nan)"""
    if a == b:
        return True
    try:
        if np.isnan(a) and np.isnan(b):
            return True
    except TypeError:
        pass
    return False
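
A few quick checks of the helper's behavior:

assert tf_equal_or_nan(np.nan, np.nan)
assert tf_equal_or_nan("E", "E")
assert not tf_equal_or_nan("E", np.nan)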
        
def df_for_field(qn, df, f, lst_v):
    """predicts quote success after changing field f from v_original to each value in lst_v.
    If prediction changes then quote is sensitive to the value of this field and True is returned
    Keyword arguments
        qn: quote number 
        df: dataframe of quote independent values
        f: field name
        lst_v: list of alternative values of independent value in field f
    Returns
        dataframe of alternative values in field f and all other fields staying the same and a column called fieldname
    """
    # Create a DataFrame which has every row identical except for field in question
    # Field f iterates through every value provided
    ind_other = df.loc[qn:qn].copy().drop(f, axis=1)  # fields other than f
    ind_f = pd.DataFrame(data={f: lst_v, "fieldname": [f] * len(lst_v)}, index=[qn] * len(lst_v))
    # Merge these two DataFrames to create one with all rows identical except field f
    return pd.merge(ind_other, ind_f, right_index=True, left_index=True)
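
For example, using the quote number and CoverageField9 values from the application below, this returns two rows that are identical apart from CoverageField9:

df_for_field(325710, df, 'CoverageField9', ['B', 'E'])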

def sensitivity_analysis(qn):
    """Using data from quote number qn do a sensitivity analysis on all independent variables"""
    time_start = datetime.now()
    # Independent variables
    ind_original = df.loc[qn]
    prd = learn.predict(ind_original)
    # Predicted quote conversion flag
    qcf_original = prd[1].item()
    # Probability that quote conversion flag is as predicted
    prb_original = prd[2][qcf_original].item()
    logger.info(f"Sensitivity Analysis for Quote {qn}")
    # Check if we actually know the correct answer
    if qn in sr_conv.index:
        logger.info(f"Actual QuoteConversion_Flag {sr_conv[qn]}")

    lst_df_for_field = []
    # Loop through all fields. Check different values of each field to see if result is sensitive to it.
    for field in df.columns:
        lst_val = lst_ind_value(df, field)
        lst_df_for_field.append(df_for_field(qn, df, field, lst_val))
    df_sensitivity = pd.concat(lst_df_for_field, ignore_index=True)
    sr_fieldname = df_sensitivity['fieldname']
    df_sensitivity.drop('fieldname', inplace=True, axis=1)
    dl = learn.dls.test_dl(df_sensitivity)
    dl.dataset.conts = dl.dataset.conts.astype(np.float32)
    # stop learn.get_preds() printing blank lines
    with io.capture_output() as captured:
        # using get_preds() rather than predict() because get_preds can do multiple rows at once
        inp,preds,_,dec_preds = learn.get_preds(dl=dl, with_input=True, with_decoded=True)
    logger.info(f"Time taken = {(datetime.now() - time_start).total_seconds()} seconds")
    df_results = pd.DataFrame({'fieldname': sr_fieldname, 'prob_success': preds[:, 1].numpy()})
    df_results.sort_values(by='prob_success', ascending=False, inplace=True)
    return df_results, df_sensitivity

Application: Step 1 - Ask user for quote number

Try quote 325710 for an example with many fields that could be changed.

qn_min = sr_conv.index.min()
qn_max = sr_conv.index.max()
qn = random.randint(qn_min, qn_max)

wdg_quote_success = widgets.Label(value="")
def handle_quote_number_change(change):
    global qn
    qn = change.new
    with io.capture_output() as captured:
        prd = learn.predict(df.loc[qn])
    qcf = prd[1].item()
    prb = prd[2][qcf].item()
    act = sr_conv[qn] if qn in sr_conv.index else "unknown"
    wdg_quote_success.value = f"Quote {change.new} actual {act} predicted {prb:.2%} {qcf}"
style = {'description_width': 'initial'}  # widget width belongs in layout, not style
wdg_quote_number_text = widgets.BoundedIntText(description="Quote number", min=qn_min, max=qn_max, value=qn, style=style)
wdg_quote_number_slider = widgets.IntSlider(description="Quote number", min=qn_min, max=qn_max, value=qn, style=style, layout={'width': '600px'})
mylink = widgets.jslink((wdg_quote_number_text, 'value'), (wdg_quote_number_slider, 'value'))
wdg_quote_number_slider.observe(handle_quote_number_change, names='value')
display(wdg_quote_number_text)
display(wdg_quote_number_slider)
display(wdg_quote_success)

Application: Step 2 - Do sensitivity analysis

out = widgets.Output(layout={'border': '1px solid green'})
with out:
    df_results, df_sensitivity = sensitivity_analysis(wdg_quote_number_slider.value)
# display(out)
df_results.head(20)
      fieldname        prob_success
2084  PropertyField29      1.000000
2109  PropertyField37      0.999607
627   PersonalField2       0.991576
1629  PersonalField84      0.991404
765   PersonalField13      0.990831
1112  PersonalField27      0.982206
1114  PersonalField27      0.976485
1113  PersonalField27      0.969538
1117  PersonalField27      0.964867
2110  PropertyField37      0.960644
1115  PersonalField27      0.948517
1119  PersonalField27      0.937945
1118  PersonalField27      0.869847
1120  PersonalField27      0.848974
1125  PersonalField27      0.821979
762   PersonalField12      0.821645
1121  PersonalField27      0.818617
1116  PersonalField27      0.806710
1123  PersonalField27      0.779141
766   PersonalField13      0.765224
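
Because each field contributes several candidate rows, the best alternative per field summarises which fields matter most:

df_results.groupby('fieldname')['prob_success'].max().sort_values(ascending=False).head(10)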

Application: Step 3 - Try altering values of sensitive fields

You can change more than one field to try to improve the probability of quote success.

Example: change CoverageField9 from E to B and SalesField10 from 0 to 6.

wdg_status = widgets.HTML(value=f"<h2>Quote {qn}</h2>")

def handle_input_change(change):
    qn = wdg_quote_number_slider.value
    ind = df.loc[qn].copy()
    for w in lst_input:
        if w.value == "nan":
            v = np.nan
        else:
            v = w.value
        logger.debug(f"{qn} {w.description} {v}")
        ind[w.description] = v
    with io.capture_output() as captured:
        prd = learn.predict(ind)
    logger.debug(f"{prd[1]} {prd[2]}")
    qcf = prd[1].item()
    prb = prd[2][qcf].item()
    act = sr_conv[qn] if qn in sr_conv.index else "unknown"
    wdg_status.value = f"<h2>Quote {qn} actual {act} predicted {prb:.2%} {qcf}</h2>"

display(wdg_status)
qn = wdg_quote_number_slider.value
style = {'description_width': 'initial'}
def nan_if_nan(n):
    """Can't include np.nan in dropdowns as np.nan != np.nan. Instead use a str"""
    try:
        if np.isnan(n):
            return "nan"
    except TypeError:
        pass
    return n
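
Quick checks (np.nan == np.nan evaluates to False, which is why a real NaN could never be re-selected from a dropdown):

assert nan_if_nan(np.nan) == "nan"
assert nan_if_nan("E") == "E"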

i = 0
dct_fields = defaultdict(list)
# Collect the top 10 distinct fields from the sorted results, with their recommended values
while len(dct_fields) < 10 and i < len(df_results):
    f = df_results.iloc[i, 0]  # fieldname column
    idx = df_results.index[i]
    ind_val = df_sensitivity.loc[idx, f]
    dct_fields[f].append(ind_val)
    i += 1
lst_input = []
for priority, (f, lst_recommend) in enumerate(dct_fields.items(), start=1):
    num_unique = df[f].nunique()
    lst_unique = sorted((str(nan_if_nan(u)), nan_if_nan(u)) for u in df[f].unique())
    v = nan_if_nan(df.loc[qn,f])
    tip = f"Priority {priority}. Initially {v}. Recommend {lst_recommend}"
    lbl = widgets.HTML(value=f"{tip}")
    if num_unique < 5 and len(lst_unique) < 4:
        wdg = widgets.RadioButtons(options=lst_unique, 
                                   description=f, 
                                   description_tooltip=tip,
                                   style=style, 
                                   value=v)
    else:
        wdg = widgets.Dropdown(options=lst_unique, 
                               description=f, 
                               description_tooltip=tip,
                               style=style, 
                               value=v)
    wdg.observe(handle_input_change, names='value')
    display(widgets.HBox(children=[wdg, lbl]))
    lst_input.append(wdg)