import pandas as pd
import torch as nn
import numpy as np
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# Directory holding the dataset, relative to the working directory.
path = Path('../data/')
# `Path.ls()` is not part of pathlib; it is only available once fastcore has
# patched `Path`, and nothing imported so far does that. Use the
# standard-library listing so this runs on a clean interpreter.
list(path.iterdir())

# Load the mushroom table and summarise every column.
df = pd.read_csv(path/'mushrooms.csv')
df.describe()
class cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color stalk-shape stalk-root stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
count 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124 8124
unique 2 6 4 10 2 9 2 2 2 12 2 5 4 4 9 9 1 4 3 5 9 6 7
top e x y n f n f c b b t b s s w w p w o p w v d
freq 4208 3656 3244 2284 4748 3528 7914 6812 5612 1728 4608 3776 5176 4936 4464 4384 8124 7924 7488 3968 2388 4040 3148
# Count missing (NaN) values per column.
# NOTE(review): the one-hot encoded frame produced later contains a
# 'stalk-root_?' column, so this dataset appears to mark unknown values with
# '?' rather than NaN -- isnull() would not count those. TODO confirm.
df.isnull().sum()

# NOTE: the commented-out options below refer to 'Cabin' and 'Age', which are
# not columns of this DataFrame (accessing df.Age raises AttributeError).
# They look like leftovers from a different notebook and are kept only for
# reference.

# Option #1 for Cabin: Delete the missing datapoints for Cabin
# df.drop('Cabin', axis=1, inplace=True)

# Option #2 for Cabin: Impute the missing datapoints via the mode
# (note: the statement below targets 'Age', not 'Cabin')
# df.Age.fillna(df.Age.mode(), inplace = True)

# Impute the missing datapoints for Age
# df.Age.fillna(df.Age.mode(), inplace = True)
AttributeError: 'DataFrame' object has no attribute 'Age'
# Columns to one-hot encode. The order of this list determines the order of
# the generated dummy columns, so it is kept exactly as written.
cat_columns = [
    'class',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-root',
    'stalk-shape',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]
# Pin the dummy dtype: pandas >= 2.0 emits bool columns by default, whereas
# the rest of this file does numeric tensor arithmetic on them (and PyTorch
# refuses `-` with bool tensors). 'uint8' matches the pre-2.0 default and
# yields 0/1 values.
df = pd.get_dummies(df, columns=cat_columns, dtype='uint8')
df.head()
class_e class_p cap-shape_b cap-shape_c cap-shape_f cap-shape_k cap-shape_s cap-shape_x cap-surface_f cap-surface_g cap-surface_s cap-surface_y cap-color_b cap-color_c cap-color_e cap-color_g cap-color_n cap-color_p cap-color_r cap-color_u cap-color_w cap-color_y bruises_f bruises_t odor_a odor_c odor_f odor_l odor_m odor_n odor_p odor_s odor_y gill-attachment_a gill-attachment_f gill-spacing_c gill-spacing_w gill-size_b gill-size_n gill-color_b gill-color_e gill-color_g gill-color_h gill-color_k gill-color_n gill-color_o gill-color_p gill-color_r gill-color_u gill-color_w gill-color_y stalk-root_? stalk-root_b stalk-root_c stalk-root_e stalk-root_r stalk-shape_e stalk-shape_t stalk-surface-above-ring_f stalk-surface-above-ring_k stalk-surface-above-ring_s stalk-surface-above-ring_y stalk-surface-below-ring_f stalk-surface-below-ring_k stalk-surface-below-ring_s stalk-surface-below-ring_y stalk-color-above-ring_b stalk-color-above-ring_c stalk-color-above-ring_e stalk-color-above-ring_g stalk-color-above-ring_n stalk-color-above-ring_o stalk-color-above-ring_p stalk-color-above-ring_w stalk-color-above-ring_y stalk-color-below-ring_b stalk-color-below-ring_c stalk-color-below-ring_e stalk-color-below-ring_g stalk-color-below-ring_n stalk-color-below-ring_o stalk-color-below-ring_p stalk-color-below-ring_w stalk-color-below-ring_y veil-type_p veil-color_n veil-color_o veil-color_w veil-color_y ring-number_n ring-number_o ring-number_t ring-type_e ring-type_f ring-type_l ring-type_n ring-type_p spore-print-color_b spore-print-color_h spore-print-color_k spore-print-color_n spore-print-color_o spore-print-color_r spore-print-color_u spore-print-color_w spore-print-color_y population_a population_c population_n population_s population_v population_y habitat_d habitat_g habitat_l habitat_m habitat_p habitat_u habitat_w
0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
2 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
3 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
4 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
# List every column name of the encoded frame (for a DataFrame, keys() is
# the column index).
df.columns.tolist()
['class_e',
 'class_p',
 'cap-shape_b',
 'cap-shape_c',
 'cap-shape_f',
 'cap-shape_k',
 'cap-shape_s',
 'cap-shape_x',
 'cap-surface_f',
 'cap-surface_g',
 'cap-surface_s',
 'cap-surface_y',
 'cap-color_b',
 'cap-color_c',
 'cap-color_e',
 'cap-color_g',
 'cap-color_n',
 'cap-color_p',
 'cap-color_r',
 'cap-color_u',
 'cap-color_w',
 'cap-color_y',
 'bruises_f',
 'bruises_t',
 'odor_a',
 'odor_c',
 'odor_f',
 'odor_l',
 'odor_m',
 'odor_n',
 'odor_p',
 'odor_s',
 'odor_y',
 'gill-attachment_a',
 'gill-attachment_f',
 'gill-spacing_c',
 'gill-spacing_w',
 'gill-size_b',
 'gill-size_n',
 'gill-color_b',
 'gill-color_e',
 'gill-color_g',
 'gill-color_h',
 'gill-color_k',
 'gill-color_n',
 'gill-color_o',
 'gill-color_p',
 'gill-color_r',
 'gill-color_u',
 'gill-color_w',
 'gill-color_y',
 'stalk-root_?',
 'stalk-root_b',
 'stalk-root_c',
 'stalk-root_e',
 'stalk-root_r',
 'stalk-shape_e',
 'stalk-shape_t',
 'stalk-surface-above-ring_f',
 'stalk-surface-above-ring_k',
 'stalk-surface-above-ring_s',
 'stalk-surface-above-ring_y',
 'stalk-surface-below-ring_f',
 'stalk-surface-below-ring_k',
 'stalk-surface-below-ring_s',
 'stalk-surface-below-ring_y',
 'stalk-color-above-ring_b',
 'stalk-color-above-ring_c',
 'stalk-color-above-ring_e',
 'stalk-color-above-ring_g',
 'stalk-color-above-ring_n',
 'stalk-color-above-ring_o',
 'stalk-color-above-ring_p',
 'stalk-color-above-ring_w',
 'stalk-color-above-ring_y',
 'stalk-color-below-ring_b',
 'stalk-color-below-ring_c',
 'stalk-color-below-ring_e',
 'stalk-color-below-ring_g',
 'stalk-color-below-ring_n',
 'stalk-color-below-ring_o',
 'stalk-color-below-ring_p',
 'stalk-color-below-ring_w',
 'stalk-color-below-ring_y',
 'veil-type_p',
 'veil-color_n',
 'veil-color_o',
 'veil-color_w',
 'veil-color_y',
 'ring-number_n',
 'ring-number_o',
 'ring-number_t',
 'ring-type_e',
 'ring-type_f',
 'ring-type_l',
 'ring-type_n',
 'ring-type_p',
 'spore-print-color_b',
 'spore-print-color_h',
 'spore-print-color_k',
 'spore-print-color_n',
 'spore-print-color_o',
 'spore-print-color_r',
 'spore-print-color_u',
 'spore-print-color_w',
 'spore-print-color_y',
 'population_a',
 'population_c',
 'population_n',
 'population_s',
 'population_v',
 'population_y',
 'habitat_d',
 'habitat_g',
 'habitat_l',
 'habitat_m',
 'habitat_p',
 'habitat_u',
 'habitat_w']
# Dependent variable: the 'class_p' indicator column as a 1-D tensor.
t_dep = nn.tensor(df['class_p'])
# Independent variables: every encoded column whose name does not contain
# 'class' (i.e. everything except the two class indicator columns).
indep_cols = [name for name in df.columns if 'class' not in name]
indep_cols
['cap-shape_b',
 'cap-shape_c',
 'cap-shape_f',
 'cap-shape_k',
 'cap-shape_s',
 'cap-shape_x',
 'cap-surface_f',
 'cap-surface_g',
 'cap-surface_s',
 'cap-surface_y',
 'cap-color_b',
 'cap-color_c',
 'cap-color_e',
 'cap-color_g',
 'cap-color_n',
 'cap-color_p',
 'cap-color_r',
 'cap-color_u',
 'cap-color_w',
 'cap-color_y',
 'bruises_f',
 'bruises_t',
 'odor_a',
 'odor_c',
 'odor_f',
 'odor_l',
 'odor_m',
 'odor_n',
 'odor_p',
 'odor_s',
 'odor_y',
 'gill-attachment_a',
 'gill-attachment_f',
 'gill-spacing_c',
 'gill-spacing_w',
 'gill-size_b',
 'gill-size_n',
 'gill-color_b',
 'gill-color_e',
 'gill-color_g',
 'gill-color_h',
 'gill-color_k',
 'gill-color_n',
 'gill-color_o',
 'gill-color_p',
 'gill-color_r',
 'gill-color_u',
 'gill-color_w',
 'gill-color_y',
 'stalk-root_?',
 'stalk-root_b',
 'stalk-root_c',
 'stalk-root_e',
 'stalk-root_r',
 'stalk-shape_e',
 'stalk-shape_t',
 'stalk-surface-above-ring_f',
 'stalk-surface-above-ring_k',
 'stalk-surface-above-ring_s',
 'stalk-surface-above-ring_y',
 'stalk-surface-below-ring_f',
 'stalk-surface-below-ring_k',
 'stalk-surface-below-ring_s',
 'stalk-surface-below-ring_y',
 'stalk-color-above-ring_b',
 'stalk-color-above-ring_c',
 'stalk-color-above-ring_e',
 'stalk-color-above-ring_g',
 'stalk-color-above-ring_n',
 'stalk-color-above-ring_o',
 'stalk-color-above-ring_p',
 'stalk-color-above-ring_w',
 'stalk-color-above-ring_y',
 'stalk-color-below-ring_b',
 'stalk-color-below-ring_c',
 'stalk-color-below-ring_e',
 'stalk-color-below-ring_g',
 'stalk-color-below-ring_n',
 'stalk-color-below-ring_o',
 'stalk-color-below-ring_p',
 'stalk-color-below-ring_w',
 'stalk-color-below-ring_y',
 'veil-type_p',
 'veil-color_n',
 'veil-color_o',
 'veil-color_w',
 'veil-color_y',
 'ring-number_n',
 'ring-number_o',
 'ring-number_t',
 'ring-type_e',
 'ring-type_f',
 'ring-type_l',
 'ring-type_n',
 'ring-type_p',
 'spore-print-color_b',
 'spore-print-color_h',
 'spore-print-color_k',
 'spore-print-color_n',
 'spore-print-color_o',
 'spore-print-color_r',
 'spore-print-color_u',
 'spore-print-color_w',
 'spore-print-color_y',
 'population_a',
 'population_c',
 'population_n',
 'population_s',
 'population_v',
 'population_y',
 'habitat_d',
 'habitat_g',
 'habitat_l',
 'habitat_m',
 'habitat_p',
 'habitat_u',
 'habitat_w']
# Feature matrix as a float tensor, one row per sample and one column per
# entry of indep_cols.
t_indep = nn.tensor(df[indep_cols].to_numpy(), dtype=nn.float32)
# Scale each column by its own maximum (taken over the rows).
vals, indices = nn.max(t_indep, dim=0)
t_indep = t_indep.div(vals)
t_indep.shape
torch.Size([8124, 117])
import torch.nn.functional as F
# Seed PyTorch's global random number generator so the random draws that
# follow are repeatable (`nn` is the alias this file uses for `torch`).
nn.manual_seed(442)

# def calc_preds(coeffs, indeps): 
#   return nn.sigmoid((indeps*coeffs).sum(axis=1))

def calc_preds(coeffs, indeps):
    """Forward pass: hidden ReLU layer, linear output layer, then sigmoid.

    ``coeffs`` must unpack into exactly three items, in order: the
    hidden-layer weight matrix, the output-layer weight matrix and an
    additive bias term. ``indeps`` is matrix-multiplied by the first of
    these. Returns the sigmoid of the output layer.
    """
    w_hidden, w_out, bias = coeffs
    hidden = F.relu(indeps @ w_hidden)
    return nn.sigmoid(hidden @ w_out + bias)

def calc_loss(coeffs, indeps, deps):
    """Return the mean absolute error between predictions and targets.

    Args:
        coeffs: parameters accepted by ``calc_preds``.
        indeps: feature tensor passed through to ``calc_preds``.
        deps: target tensor with one value per prediction; it may be 1-D
            or already shaped like the predictions.

    Returns:
        A 0-dim tensor holding ``mean(|preds - deps|)``.

    Raises:
        RuntimeError: if ``deps`` does not contain exactly one value per
            prediction (the reshape fails).
    """
    preds = calc_preds(coeffs, indeps)
    # The network's output layer yields a column of shape (n, 1) while the
    # targets are stored 1-D as (n,). Subtracting those directly broadcasts
    # to an (n, n) matrix and averages every prediction against every
    # target, which produces a meaningless loss. Align the shapes first,
    # and cast so integer/bool targets subtract cleanly.
    targets = deps.reshape(preds.shape).to(preds.dtype)
    return nn.abs(preds - targets).mean()

def init_coeffs(n_hidden=20):
    """Create randomly initialised, gradient-tracking network parameters.

    The input width is read from the module-level ``t_indep`` tensor.
    Returns a 3-tuple: hidden-layer weights of shape
    ``(n_inputs, n_hidden)``, output-layer weights of shape
    ``(n_hidden, 1)`` and a 0-dim bias.
    """
    n_inputs = t_indep.shape[1]
    # The three random draws happen in this fixed order so that a given
    # seed always produces the same parameters.
    hidden_w = (nn.rand(n_inputs, n_hidden) - 0.5) / n_hidden
    output_w = nn.rand(n_hidden, 1) - 0.3
    bias = nn.rand(1)[0]
    return tuple(p.requires_grad_() for p in (hidden_w, output_w, bias))
# def init_weights():
#   n_coeff = t_indep.shape[1]
#   coeffs = nn.rand(n_coeff)-0.5
#   return coeffs.requires_grad_()
from fastai.data.transforms import RandomSplitter
# Randomly partition the row indices into training and validation sets
# (fixed seed so the split is repeatable).
trn_split, val_split = RandomSplitter(seed=42)(df)

# Slice features and targets with the same index sets so rows stay aligned.
trn_indep = t_indep[trn_split]
val_indep = t_indep[val_split]
trn_dep = t_dep[trn_split]
val_dep = t_dep[val_split]
len(trn_indep), len(val_indep)
(6500, 1624)
def update_coeffs(coeffs, lr):
    """Apply one gradient-descent step to every tensor in ``coeffs``.

    Each tensor is modified in place by subtracting ``lr`` times its
    gradient, after which that gradient is zeroed for the next backward
    pass. The in-place update is expected to run with autograd recording
    disabled by the caller.
    """
    for param in coeffs:
        grad = param.grad
        step = grad * lr
        param.sub_(step)
        grad.zero_()

def one_epoch(coeffs, lr):
    """Run one full-batch training step and print the loss.

    Computes the loss on the module-level training tensors,
    back-propagates it, updates ``coeffs`` in place using learning rate
    ``lr``, and prints the pre-update loss to three decimals followed by
    "; ".
    """
    epoch_loss = calc_loss(coeffs, trn_indep, trn_dep)
    epoch_loss.backward()
    # The parameter update itself must not be recorded by autograd.
    with nn.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{epoch_loss:.3f}", end="; ")

def train_model(epochs=30, lr=0.01):
    """Train freshly initialised parameters and return them.

    Re-seeds the RNG on every call so the initial parameters are the
    same each time, then runs ``epochs`` passes of ``one_epoch`` with
    learning rate ``lr``.
    """
    nn.manual_seed(442)
    params = init_coeffs()
    for _ in range(epochs):
        one_epoch(params, lr=lr)
    return params

# Train for 18 epochs with a learning rate of 0.2 and keep the fitted
# parameters; each epoch's loss is printed as it runs.
coeffs = train_model(18, lr=0.2)
0.507; 0.507; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 
def show_coeffs():
    """Map each feature column name to its learned first-layer weights.

    Reads the module-level ``coeffs`` and ``indep_cols``. ``coeffs`` is
    normally the ``(layer1, layer2, const)`` tuple returned by
    ``train_model``; a tuple has no ``requires_grad_`` method, so the
    first-layer weight matrix is selected explicitly. Its rows line up
    with ``indep_cols``, so each column name maps to that feature's row
    of hidden-unit weights. A bare tensor of per-feature coefficients is
    also accepted.

    The weights are detached rather than switched to
    ``requires_grad=False`` in place, so the trained parameters remain
    usable for further training.
    """
    weights = coeffs[0] if isinstance(coeffs, (tuple, list)) else coeffs
    return dict(zip(indep_cols, weights.detach()))
# Inspect the column-name -> coefficient mapping for the trained model.
show_coeffs()
AttributeError: 'tuple' object has no attribute 'requires_grad_'
def acc(coeffs):
    """Return the fraction of validation rows classified correctly.

    Predictions for the module-level ``val_indep`` are thresholded at 0.5
    and compared with ``val_dep``. The result is a 0-dim float tensor.
    """
    preds = calc_preds(coeffs, val_indep)
    # Predictions come back as an (n, 1) column while the targets are 1-D
    # (n,). Comparing them directly broadcasts to an (n, n) matrix, so the
    # mean would measure every prediction against every target instead of
    # its own. Reshape the targets to match first.
    targets = val_dep.reshape(preds.shape).bool()
    return (targets == (preds > 0.5)).float().mean()

# Accuracy of the trained parameters on the validation split.
acc(coeffs)
tensor(0.4723)