import pandas as pd
import torch as nn
import numpy as np
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# Locate the data directory and load the mushroom dataset.
path = Path('../data/')
path
# List the directory with plain pathlib: Path.ls() only exists once fastcore has
# patched Path, and fastai is not imported until later in this file.
list(path.iterdir())
df = pd.read_csv(path/'mushrooms.csv')
# Every column is categorical, so describe() reports count/unique/top/freq.
df.describe()
class | cap-shape | cap-surface | cap-color | bruises | odor | gill-attachment | gill-spacing | gill-size | gill-color | stalk-shape | stalk-root | stalk-surface-above-ring | stalk-surface-below-ring | stalk-color-above-ring | stalk-color-below-ring | veil-type | veil-color | ring-number | ring-type | spore-print-color | population | habitat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 | 8124 |
unique | 2 | 6 | 4 | 10 | 2 | 9 | 2 | 2 | 2 | 12 | 2 | 5 | 4 | 4 | 9 | 9 | 1 | 4 | 3 | 5 | 9 | 6 | 7 |
top | e | x | y | n | f | n | f | c | b | b | t | b | s | s | w | w | p | w | o | p | w | v | d |
freq | 4208 | 3656 | 3244 | 2284 | 4748 | 3528 | 7914 | 6812 | 5612 | 1728 | 4608 | 3776 | 5176 | 4936 | 4464 | 4384 | 8124 | 7924 | 7488 | 3968 | 2388 | 4040 | 3148 |
# Show the number of missing values per column.
# NOTE(review): isnull() only counts NaN/None. The one-hot column list further down
# contains 'stalk-root_?', so unknowns appear to be encoded as the string '?' in this
# dataset and would not be counted here -- confirm against the raw CSV.
df.isnull().sum()
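# NOTE: the options below appear to be carried over from a different dataset; this
# DataFrame has no 'Cabin' or 'Age' columns, so they are kept only as a reference.
# Option #1 for Cabin: Delete the missing datapoints for Cabin
# df.drop('Cabin', axis=1, inplace=True)
# Option #2 for Cabin: Impute the missing datapoints for Cabin via mode
# df.Cabin.fillna(df.Cabin.mode()[0], inplace=True)
# Impute the missing datapoints for Age via mode (mode() returns a Series, so take [0])
# df.Age.fillna(df.Age.mode()[0], inplace=True)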
AttributeError: 'DataFrame' object has no attribute 'Age'
# Every column in this dataset is categorical, so all of them are one-hot encoded.
# The order of this list determines the order of the generated dummy columns.
cat_columns = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-root', 'stalk-shape',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color', 'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
# Pin the dummy dtype to uint8 so the columns stay numeric 0/1 (as in the output
# below) on pandas versions where get_dummies defaults to bool.
df = pd.get_dummies(df, columns=cat_columns, dtype=np.uint8)
df.head()
class_e | class_p | cap-shape_b | cap-shape_c | cap-shape_f | cap-shape_k | cap-shape_s | cap-shape_x | cap-surface_f | cap-surface_g | cap-surface_s | cap-surface_y | cap-color_b | cap-color_c | cap-color_e | cap-color_g | cap-color_n | cap-color_p | cap-color_r | cap-color_u | cap-color_w | cap-color_y | bruises_f | bruises_t | odor_a | odor_c | odor_f | odor_l | odor_m | odor_n | odor_p | odor_s | odor_y | gill-attachment_a | gill-attachment_f | gill-spacing_c | gill-spacing_w | gill-size_b | gill-size_n | gill-color_b | gill-color_e | gill-color_g | gill-color_h | gill-color_k | gill-color_n | gill-color_o | gill-color_p | gill-color_r | gill-color_u | gill-color_w | gill-color_y | stalk-root_? | stalk-root_b | stalk-root_c | stalk-root_e | stalk-root_r | stalk-shape_e | stalk-shape_t | stalk-surface-above-ring_f | stalk-surface-above-ring_k | stalk-surface-above-ring_s | stalk-surface-above-ring_y | stalk-surface-below-ring_f | stalk-surface-below-ring_k | stalk-surface-below-ring_s | stalk-surface-below-ring_y | stalk-color-above-ring_b | stalk-color-above-ring_c | stalk-color-above-ring_e | stalk-color-above-ring_g | stalk-color-above-ring_n | stalk-color-above-ring_o | stalk-color-above-ring_p | stalk-color-above-ring_w | stalk-color-above-ring_y | stalk-color-below-ring_b | stalk-color-below-ring_c | stalk-color-below-ring_e | stalk-color-below-ring_g | stalk-color-below-ring_n | stalk-color-below-ring_o | stalk-color-below-ring_p | stalk-color-below-ring_w | stalk-color-below-ring_y | veil-type_p | veil-color_n | veil-color_o | veil-color_w | veil-color_y | ring-number_n | ring-number_o | ring-number_t | ring-type_e | ring-type_f | ring-type_l | ring-type_n | ring-type_p | spore-print-color_b | spore-print-color_h | spore-print-color_k | spore-print-color_n | spore-print-color_o | spore-print-color_r | spore-print-color_u | spore-print-color_w | spore-print-color_y | population_a | population_c | population_n | population_s | population_v | population_y | 
habitat_d | habitat_g | habitat_l | habitat_m | habitat_p | habitat_u | habitat_w | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
# Display the names of all one-hot encoded columns as a plain Python list.
df.columns.tolist()
['class_e',
'class_p',
'cap-shape_b',
'cap-shape_c',
'cap-shape_f',
'cap-shape_k',
'cap-shape_s',
'cap-shape_x',
'cap-surface_f',
'cap-surface_g',
'cap-surface_s',
'cap-surface_y',
'cap-color_b',
'cap-color_c',
'cap-color_e',
'cap-color_g',
'cap-color_n',
'cap-color_p',
'cap-color_r',
'cap-color_u',
'cap-color_w',
'cap-color_y',
'bruises_f',
'bruises_t',
'odor_a',
'odor_c',
'odor_f',
'odor_l',
'odor_m',
'odor_n',
'odor_p',
'odor_s',
'odor_y',
'gill-attachment_a',
'gill-attachment_f',
'gill-spacing_c',
'gill-spacing_w',
'gill-size_b',
'gill-size_n',
'gill-color_b',
'gill-color_e',
'gill-color_g',
'gill-color_h',
'gill-color_k',
'gill-color_n',
'gill-color_o',
'gill-color_p',
'gill-color_r',
'gill-color_u',
'gill-color_w',
'gill-color_y',
'stalk-root_?',
'stalk-root_b',
'stalk-root_c',
'stalk-root_e',
'stalk-root_r',
'stalk-shape_e',
'stalk-shape_t',
'stalk-surface-above-ring_f',
'stalk-surface-above-ring_k',
'stalk-surface-above-ring_s',
'stalk-surface-above-ring_y',
'stalk-surface-below-ring_f',
'stalk-surface-below-ring_k',
'stalk-surface-below-ring_s',
'stalk-surface-below-ring_y',
'stalk-color-above-ring_b',
'stalk-color-above-ring_c',
'stalk-color-above-ring_e',
'stalk-color-above-ring_g',
'stalk-color-above-ring_n',
'stalk-color-above-ring_o',
'stalk-color-above-ring_p',
'stalk-color-above-ring_w',
'stalk-color-above-ring_y',
'stalk-color-below-ring_b',
'stalk-color-below-ring_c',
'stalk-color-below-ring_e',
'stalk-color-below-ring_g',
'stalk-color-below-ring_n',
'stalk-color-below-ring_o',
'stalk-color-below-ring_p',
'stalk-color-below-ring_w',
'stalk-color-below-ring_y',
'veil-type_p',
'veil-color_n',
'veil-color_o',
'veil-color_w',
'veil-color_y',
'ring-number_n',
'ring-number_o',
'ring-number_t',
'ring-type_e',
'ring-type_f',
'ring-type_l',
'ring-type_n',
'ring-type_p',
'spore-print-color_b',
'spore-print-color_h',
'spore-print-color_k',
'spore-print-color_n',
'spore-print-color_o',
'spore-print-color_r',
'spore-print-color_u',
'spore-print-color_w',
'spore-print-color_y',
'population_a',
'population_c',
'population_n',
'population_s',
'population_v',
'population_y',
'habitat_d',
'habitat_g',
'habitat_l',
'habitat_m',
'habitat_p',
'habitat_u',
'habitat_w']
# Dependent variable: the 'class_p' indicator column (1-D tensor, one entry per row).
t_dep = nn.tensor(df['class_p'])
# Independent variables: every column whose name does not contain 'class',
# i.e. everything except the two target indicator columns.
indep_cols = [col for col in df.columns if 'class' not in col]
indep_cols
['cap-shape_b',
'cap-shape_c',
'cap-shape_f',
'cap-shape_k',
'cap-shape_s',
'cap-shape_x',
'cap-surface_f',
'cap-surface_g',
'cap-surface_s',
'cap-surface_y',
'cap-color_b',
'cap-color_c',
'cap-color_e',
'cap-color_g',
'cap-color_n',
'cap-color_p',
'cap-color_r',
'cap-color_u',
'cap-color_w',
'cap-color_y',
'bruises_f',
'bruises_t',
'odor_a',
'odor_c',
'odor_f',
'odor_l',
'odor_m',
'odor_n',
'odor_p',
'odor_s',
'odor_y',
'gill-attachment_a',
'gill-attachment_f',
'gill-spacing_c',
'gill-spacing_w',
'gill-size_b',
'gill-size_n',
'gill-color_b',
'gill-color_e',
'gill-color_g',
'gill-color_h',
'gill-color_k',
'gill-color_n',
'gill-color_o',
'gill-color_p',
'gill-color_r',
'gill-color_u',
'gill-color_w',
'gill-color_y',
'stalk-root_?',
'stalk-root_b',
'stalk-root_c',
'stalk-root_e',
'stalk-root_r',
'stalk-shape_e',
'stalk-shape_t',
'stalk-surface-above-ring_f',
'stalk-surface-above-ring_k',
'stalk-surface-above-ring_s',
'stalk-surface-above-ring_y',
'stalk-surface-below-ring_f',
'stalk-surface-below-ring_k',
'stalk-surface-below-ring_s',
'stalk-surface-below-ring_y',
'stalk-color-above-ring_b',
'stalk-color-above-ring_c',
'stalk-color-above-ring_e',
'stalk-color-above-ring_g',
'stalk-color-above-ring_n',
'stalk-color-above-ring_o',
'stalk-color-above-ring_p',
'stalk-color-above-ring_w',
'stalk-color-above-ring_y',
'stalk-color-below-ring_b',
'stalk-color-below-ring_c',
'stalk-color-below-ring_e',
'stalk-color-below-ring_g',
'stalk-color-below-ring_n',
'stalk-color-below-ring_o',
'stalk-color-below-ring_p',
'stalk-color-below-ring_w',
'stalk-color-below-ring_y',
'veil-type_p',
'veil-color_n',
'veil-color_o',
'veil-color_w',
'veil-color_y',
'ring-number_n',
'ring-number_o',
'ring-number_t',
'ring-type_e',
'ring-type_f',
'ring-type_l',
'ring-type_n',
'ring-type_p',
'spore-print-color_b',
'spore-print-color_h',
'spore-print-color_k',
'spore-print-color_n',
'spore-print-color_o',
'spore-print-color_r',
'spore-print-color_u',
'spore-print-color_w',
'spore-print-color_y',
'population_a',
'population_c',
'population_n',
'population_s',
'population_v',
'population_y',
'habitat_d',
'habitat_g',
'habitat_l',
'habitat_m',
'habitat_p',
'habitat_u',
'habitat_w']
# Build the float feature matrix from the independent columns.
t_indep = nn.tensor(df[indep_cols].values, dtype=nn.float)
# Scale each column by its maximum so every feature lies in [0, 1].
vals,indices = t_indep.max(dim=0)
t_indep = t_indep / vals
t_indep.shape
torch.Size([8124, 117])
import torch.nn.functional as F
# Seed torch's RNG so random coefficient initialisation is reproducible.
nn.manual_seed(442)
# def calc_preds(coeffs, indeps):
# return nn.sigmoid((indeps*coeffs).sum(axis=1))
def calc_preds(coeffs, indeps):
    """Run the forward pass of a one-hidden-layer network.

    Args:
        coeffs: 3-tuple ``(l1, l2, const)`` holding the first-layer weights,
            the second-layer weights and a scalar bias.
        indeps: feature matrix; it is matrix-multiplied with ``l1``.

    Returns:
        ``sigmoid(relu(indeps @ l1) @ l2 + const)``. With ``l2`` shaped
        ``(n_hidden, 1)`` the result has one column, i.e. shape ``(N, 1)``.
    """
    l1, l2, const = coeffs
    hidden = F.relu(indeps @ l1)
    logits = hidden @ l2 + const
    return nn.sigmoid(logits)
def calc_loss(coeffs, indeps, deps):
    """Return the mean absolute error between predictions and targets.

    Args:
        coeffs: coefficients passed through to ``calc_preds``.
        indeps: feature matrix passed through to ``calc_preds``.
        deps: target values, one per row of ``indeps``.

    Returns:
        Scalar tensor holding ``mean(|preds - deps|)``.
    """
    preds = calc_preds(coeffs, indeps)
    # calc_preds returns a column (N, 1); a 1-D target (N,) would broadcast the
    # subtraction to an (N, N) matrix and yield a meaningless loss. Align the
    # target's shape and dtype with the predictions first.
    targets = deps.reshape(preds.shape).to(preds.dtype)
    return nn.abs(preds - targets).mean()
def init_coeffs(n_hidden=20):
    """Create randomly initialised coefficients for the one-hidden-layer network.

    Args:
        n_hidden: number of hidden units.

    Returns:
        Tuple ``(layer1, layer2, const)`` with gradients enabled:
        ``layer1`` is ``(n_coeff, n_hidden)``, ``layer2`` is ``(n_hidden, 1)``
        and ``const`` is a scalar, where ``n_coeff`` is the number of columns
        in the module-level ``t_indep``.
    """
    n_coeff = t_indep.shape[1]
    # Centre the first layer around zero and scale it down by the hidden size.
    layer1 = (nn.rand(n_coeff, n_hidden) - 0.5) / n_hidden
    layer2 = nn.rand(n_hidden, 1) - 0.3
    const = nn.rand(1)[0]
    return layer1.requires_grad_(), layer2.requires_grad_(), const.requires_grad_()
# def init_weights():
# n_coeff = t_indep.shape[1]
# coeffs = nn.rand(n_coeff)-0.5
# return coeffs.requires_grad_()
from fastai.data.transforms import RandomSplitter
# Randomly split the row indices into training and validation sets (fixed seed).
trn_split,val_split = RandomSplitter(seed=42)(df)
# Index features and targets with the same splits so rows stay aligned.
trn_indep,val_indep = t_indep[trn_split],t_indep[val_split]
trn_dep,val_dep = t_dep[trn_split],t_dep[val_split]
len(trn_indep),len(val_indep)
(6500, 1624)
def update_coeffs(coeffs, lr):
    """Apply one in-place gradient-descent step to every coefficient tensor.

    Args:
        coeffs: iterable of tensors whose ``.grad`` has already been populated.
        lr: learning rate multiplied with each gradient.

    Each tensor is updated in place and its gradient is reset to zero. The
    in-place update is not wrapped in ``no_grad`` here, so the caller is
    expected to do that (as ``one_epoch`` does).
    """
    for layer in coeffs:
        layer.sub_(layer.grad * lr)
        # Clear the gradient so the next backward() does not accumulate onto it.
        layer.grad.zero_()
def one_epoch(coeffs, lr):
    """Run one full-batch training step on the training split and print the loss.

    Args:
        coeffs: coefficient tensors to update in place.
        lr: learning rate for the gradient step.
    """
    loss = calc_loss(coeffs, trn_indep, trn_dep)
    loss.backward()
    # The in-place parameter update must not be tracked by autograd.
    with nn.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")
def train_model(epochs=30, lr=0.01):
    """Train the network from a fixed random initialisation.

    Args:
        epochs: number of full-batch training steps to run.
        lr: learning rate used for every step.

    Returns:
        The trained coefficient tuple produced by ``init_coeffs``.
    """
    # Re-seed on every call so repeated runs start from the same coefficients.
    nn.manual_seed(442)
    coeffs = init_coeffs()
    for _ in range(epochs):
        one_epoch(coeffs, lr=lr)
    return coeffs
# Train for 18 epochs with learning rate 0.2 and keep the resulting coefficients.
coeffs = train_model(18, lr=0.2)
0.507; 0.507; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506;
def show_coeffs():
    """Map each independent column name to its learned input weights.

    Reads the module-level ``coeffs`` and ``indep_cols``. ``train_model``
    returns a tuple ``(layer1, layer2, const)``, and a tuple has no
    ``requires_grad_`` method, so the first-layer weights are used: each
    column name maps to its row of ``layer1``. A bare coefficient tensor is
    still accepted and used directly.

    Returns:
        dict of column name -> detached weight tensor.
    """
    weights = coeffs[0] if isinstance(coeffs, (tuple, list)) else coeffs
    # detach() gives a gradient-free view without mutating the trained tensors.
    return dict(zip(indep_cols, weights.detach()))

show_coeffs()
AttributeError: 'tuple' object has no attribute 'requires_grad_'
def acc(coeffs):
    """Return the classification accuracy on the validation split.

    Args:
        coeffs: coefficients passed through to ``calc_preds``.

    Returns:
        Scalar tensor with the fraction of validation rows whose thresholded
        prediction (> 0.5) matches the target.
    """
    preds = calc_preds(coeffs, val_indep)
    # Predictions are a column (N, 1) while val_dep is 1-D (N,). Comparing them
    # directly broadcasts to (N, N) and compares every prediction with every
    # target, so reshape the targets to the predictions' shape first.
    targets = val_dep.reshape(preds.shape).bool()
    return (targets == (preds > 0.5)).float().mean()

acc(coeffs)
tensor(0.4723)