Candid Code – deep-learning-basics

import pandas as pd
import torch as nn
import numpy as np
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Directory that holds the raw CSV file, relative to this notebook.
path = Path('../data/')
# List the directory with plain pathlib. `Path.ls()` is not a pathlib method;
# it only exists after fastcore/fastai has patched it in, and fastai is not
# imported until much later in this notebook.
sorted(path.iterdir())

# Load the mushroom table into a DataFrame.
df = pd.read_csv(path/'mushrooms.csv')

# Per-column summary (count / unique / top / freq).
df.describe()

	class	cap-shape	cap-surface	cap-color	bruises	odor	gill-attachment	gill-spacing	gill-size	gill-color	stalk-shape	stalk-root	stalk-surface-above-ring	stalk-surface-below-ring	stalk-color-above-ring	stalk-color-below-ring	veil-type	veil-color	ring-number	ring-type	spore-print-color	population	habitat
count	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124	8124
unique	2	6	4	10	2	9	2	2	2	12	2	5	4	4	9	9	1	4	3	5	9	6	7
top	e	x	y	n	f	n	f	c	b	b	t	b	s	s	w	w	p	w	o	p	w	v	d
freq	4208	3656	3244	2284	4748	3528	7914	6812	5612	1728	4608	3776	5176	4936	4464	4384	8124	7924	7488	3968	2388	4040	3148

# Show me missing values.
# A plain NaN count is misleading for this table: the one-hot encoding further
# down produces a 'stalk-root_?' column, i.e. the CSV marks unknown values with
# a literal '?' rather than leaving the cell empty. Map that placeholder to NaN
# on a temporary copy (df itself is not modified) so it shows up in the tally.
df.replace('?', np.nan).isnull().sum()

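# NOTE: the options below appear to be leftovers from a different dataset.
# This table has no 'Cabin' or 'Age' column, so they stay commented out.

# Option #1 for Cabin: Drop the Cabin column that holds the missing datapoints
# df.drop('Cabin', axis=1, inplace=True)

# Option #2 for Cabin: Impute the missing datapoints for Cabin via the mode
# (mode() returns a Series, so take its first entry)
# df.Cabin.fillna(df.Cabin.mode()[0], inplace=True)

# Impute the missing datapoints for Age via the mode
# df.Age.fillna(df.Age.mode()[0], inplace=True)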

AttributeError: 'DataFrame' object has no attribute 'Age'

# Columns to one-hot encode (this is every column of the table, the target
# 'class' included). Note that 'stalk-root' is listed before 'stalk-shape'.
cat_columns = [
    'class',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-root',
    'stalk-shape',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]
# Replace each listed column by one indicator column per category value.
df = pd.get_dummies(data=df, columns=cat_columns)
df.head(n=5)

	class_e	class_p	cap-shape_b	cap-shape_x	cap-surface_s	cap-surface_y	cap-color_g	cap-color_n	cap-color_w	cap-color_y	bruises_f	bruises_t	odor_a	odor_l	odor_n	odor_p	gill-attachment_f	gill-spacing_c	gill-spacing_w	gill-size_b	gill-size_n	gill-color_k	gill-color_n	stalk-root_c	stalk-root_e	stalk-shape_e	stalk-shape_t	stalk-surface-above-ring_s	stalk-surface-below-ring_s	stalk-color-above-ring_w	stalk-color-below-ring_w	veil-type_p	veil-color_w	ring-number_o	ring-type_e	ring-type_p	spore-print-color_k	spore-print-color_n	population_a	population_n	population_s	habitat_g	habitat_m	habitat_u
0	0	1	0	1	1	0	0	1	0	0	0	1	0	0	0	1	1	1	0	0	1	1	0	0	1	1	0	1	1	1	1	1	1	1	0	1	1	0	0	0	1	0	0	1
1	1	0	0	1	1	0	0	0	0	1	0	1	1	0	0	0	1	1	0	1	0	1	0	1	0	1	0	1	1	1	1	1	1	1	0	1	0	1	0	1	0	1	0	0
2	1	0	1	0	1	0	0	0	1	0	0	1	0	1	0	0	1	1	0	1	0	0	1	1	0	1	0	1	1	1	1	1	1	1	0	1	0	1	0	1	0	0	1	0
3	0	1	0	1	0	1	0	0	1	0	0	1	0	0	0	1	1	1	0	0	1	0	1	0	1	1	0	1	1	1	1	1	1	1	0	1	1	0	0	0	1	0	0	1
4	1	0	0	1	1	0	1	0	0	0	1	0	0	0	1	0	1	0	1	1	0	1	0	0	1	0	1	1	1	1	1	1	1	1	1	0	0	1	1	0	0	1	0	0

# Full list of the generated indicator column names.
df.columns.to_list()

['class_e',
 'class_p',
 'cap-shape_b',
 'cap-shape_c',
 'cap-shape_f',
 'cap-shape_k',
 'cap-shape_s',
 'cap-shape_x',
 'cap-surface_f',
 'cap-surface_g',
 'cap-surface_s',
 'cap-surface_y',
 'cap-color_b',
 'cap-color_c',
 'cap-color_e',
 'cap-color_g',
 'cap-color_n',
 'cap-color_p',
 'cap-color_r',
 'cap-color_u',
 'cap-color_w',
 'cap-color_y',
 'bruises_f',
 'bruises_t',
 'odor_a',
 'odor_c',
 'odor_f',
 'odor_l',
 'odor_m',
 'odor_n',
 'odor_p',
 'odor_s',
 'odor_y',
 'gill-attachment_a',
 'gill-attachment_f',
 'gill-spacing_c',
 'gill-spacing_w',
 'gill-size_b',
 'gill-size_n',
 'gill-color_b',
 'gill-color_e',
 'gill-color_g',
 'gill-color_h',
 'gill-color_k',
 'gill-color_n',
 'gill-color_o',
 'gill-color_p',
 'gill-color_r',
 'gill-color_u',
 'gill-color_w',
 'gill-color_y',
 'stalk-root_?',
 'stalk-root_b',
 'stalk-root_c',
 'stalk-root_e',
 'stalk-root_r',
 'stalk-shape_e',
 'stalk-shape_t',
 'stalk-surface-above-ring_f',
 'stalk-surface-above-ring_k',
 'stalk-surface-above-ring_s',
 'stalk-surface-above-ring_y',
 'stalk-surface-below-ring_f',
 'stalk-surface-below-ring_k',
 'stalk-surface-below-ring_s',
 'stalk-surface-below-ring_y',
 'stalk-color-above-ring_b',
 'stalk-color-above-ring_c',
 'stalk-color-above-ring_e',
 'stalk-color-above-ring_g',
 'stalk-color-above-ring_n',
 'stalk-color-above-ring_o',
 'stalk-color-above-ring_p',
 'stalk-color-above-ring_w',
 'stalk-color-above-ring_y',
 'stalk-color-below-ring_b',
 'stalk-color-below-ring_c',
 'stalk-color-below-ring_e',
 'stalk-color-below-ring_g',
 'stalk-color-below-ring_n',
 'stalk-color-below-ring_o',
 'stalk-color-below-ring_p',
 'stalk-color-below-ring_w',
 'stalk-color-below-ring_y',
 'veil-type_p',
 'veil-color_n',
 'veil-color_o',
 'veil-color_w',
 'veil-color_y',
 'ring-number_n',
 'ring-number_o',
 'ring-number_t',
 'ring-type_e',
 'ring-type_f',
 'ring-type_l',
 'ring-type_n',
 'ring-type_p',
 'spore-print-color_b',
 'spore-print-color_h',
 'spore-print-color_k',
 'spore-print-color_n',
 'spore-print-color_o',
 'spore-print-color_r',
 'spore-print-color_u',
 'spore-print-color_w',
 'spore-print-color_y',
 'population_a',
 'population_c',
 'population_n',
 'population_s',
 'population_v',
 'population_y',
 'habitat_d',
 'habitat_g',
 'habitat_l',
 'habitat_m',
 'habitat_p',
 'habitat_u',
 'habitat_w']

# Dependent variable: the 'class_p' indicator column (presumably 1 = poisonous).
# Built from the raw values with an explicit float dtype, mirroring how t_indep
# is created, so the loss can subtract it from the float predictions even when
# pandas emits boolean or integer indicator columns.
t_dep = nn.tensor(df['class_p'].values, dtype=nn.float)

# Independent variables: every indicator column whose name does not contain
# 'class' (i.e. everything except the class_e / class_p target columns).
indep_cols = [col for col in df.columns if 'class' not in col]
indep_cols

['cap-shape_b',
 'cap-shape_c',
 'cap-shape_f',
 'cap-shape_k',
 'cap-shape_s',
 'cap-shape_x',
 'cap-surface_f',
 'cap-surface_g',
 'cap-surface_s',
 'cap-surface_y',
 'cap-color_b',
 'cap-color_c',
 'cap-color_e',
 'cap-color_g',
 'cap-color_n',
 'cap-color_p',
 'cap-color_r',
 'cap-color_u',
 'cap-color_w',
 'cap-color_y',
 'bruises_f',
 'bruises_t',
 'odor_a',
 'odor_c',
 'odor_f',
 'odor_l',
 'odor_m',
 'odor_n',
 'odor_p',
 'odor_s',
 'odor_y',
 'gill-attachment_a',
 'gill-attachment_f',
 'gill-spacing_c',
 'gill-spacing_w',
 'gill-size_b',
 'gill-size_n',
 'gill-color_b',
 'gill-color_e',
 'gill-color_g',
 'gill-color_h',
 'gill-color_k',
 'gill-color_n',
 'gill-color_o',
 'gill-color_p',
 'gill-color_r',
 'gill-color_u',
 'gill-color_w',
 'gill-color_y',
 'stalk-root_?',
 'stalk-root_b',
 'stalk-root_c',
 'stalk-root_e',
 'stalk-root_r',
 'stalk-shape_e',
 'stalk-shape_t',
 'stalk-surface-above-ring_f',
 'stalk-surface-above-ring_k',
 'stalk-surface-above-ring_s',
 'stalk-surface-above-ring_y',
 'stalk-surface-below-ring_f',
 'stalk-surface-below-ring_k',
 'stalk-surface-below-ring_s',
 'stalk-surface-below-ring_y',
 'stalk-color-above-ring_b',
 'stalk-color-above-ring_c',
 'stalk-color-above-ring_e',
 'stalk-color-above-ring_g',
 'stalk-color-above-ring_n',
 'stalk-color-above-ring_o',
 'stalk-color-above-ring_p',
 'stalk-color-above-ring_w',
 'stalk-color-above-ring_y',
 'stalk-color-below-ring_b',
 'stalk-color-below-ring_c',
 'stalk-color-below-ring_e',
 'stalk-color-below-ring_g',
 'stalk-color-below-ring_n',
 'stalk-color-below-ring_o',
 'stalk-color-below-ring_p',
 'stalk-color-below-ring_w',
 'stalk-color-below-ring_y',
 'veil-type_p',
 'veil-color_n',
 'veil-color_o',
 'veil-color_w',
 'veil-color_y',
 'ring-number_n',
 'ring-number_o',
 'ring-number_t',
 'ring-type_e',
 'ring-type_f',
 'ring-type_l',
 'ring-type_n',
 'ring-type_p',
 'spore-print-color_b',
 'spore-print-color_h',
 'spore-print-color_k',
 'spore-print-color_n',
 'spore-print-color_o',
 'spore-print-color_r',
 'spore-print-color_u',
 'spore-print-color_w',
 'spore-print-color_y',
 'population_a',
 'population_c',
 'population_n',
 'population_s',
 'population_v',
 'population_y',
 'habitat_d',
 'habitat_g',
 'habitat_l',
 'habitat_m',
 'habitat_p',
 'habitat_u',
 'habitat_w']

# Feature matrix as a float tensor: one row per sample, one column per indicator.
t_indep = nn.tensor(df[indep_cols].to_numpy(), dtype=nn.float32)
# Per-column maxima (and the row index of each) used to rescale every feature.
vals, indices = nn.max(t_indep, dim=0)
t_indep = t_indep.div(vals)

t_indep.shape

torch.Size([8124, 117])

import torch.nn.functional as F
# Seed torch's random number generator so the randomly drawn initial weights
# are the same on every run. (`nn` is the notebook's alias for `torch`.)
nn.manual_seed(442)

# def calc_preds(coeffs, indeps): 
#   return nn.sigmoid((indeps*coeffs).sum(axis=1))

def calc_preds(coeffs, indeps):
    """Run the one-hidden-layer network and return one probability per row.

    Args:
        coeffs: ``(l1, l2, const)`` - hidden-layer weight matrix, output-layer
            weight matrix with a single output column, and scalar bias.
        indeps: 2-D tensor of inputs, one row per sample.

    Returns:
        1-D tensor of sigmoid outputs, one entry per row of ``indeps``.
    """
    l1, l2, const = coeffs
    hidden = F.relu(indeps @ l1)
    # `hidden @ l2` carries a trailing singleton column, shape (N, 1). Drop it
    # so the result is (N,), like the 1-D targets. Otherwise `preds - deps` and
    # `deps == preds` silently broadcast to an (N, N) matrix, which corrupts
    # both the loss and the accuracy.
    logits = (hidden @ l2).squeeze(-1) + const
    return nn.sigmoid(logits)

def calc_loss(coeffs, indeps, deps):
    """Return the mean absolute error between predictions and targets.

    Args:
        coeffs: model coefficients, passed straight through to ``calc_preds``.
        indeps: input tensor, passed straight through to ``calc_preds``.
        deps: target tensor with one value per prediction.

    Returns:
        Scalar tensor holding the mean absolute error.
    """
    preds = calc_preds(coeffs, indeps)
    # Give the targets exactly the predictions' shape and dtype. A (N, 1)
    # prediction minus a (N,) target would otherwise broadcast to (N, N) and
    # average over every prediction/target pairing instead of matching rows.
    targets = deps.reshape(preds.shape).to(preds.dtype)
    return nn.abs(preds - targets).mean()

def init_coeffs(n_hidden=20, n_coeff=None):
    """Create randomly initialised coefficients for the one-hidden-layer net.

    Args:
        n_hidden: number of hidden units.
        n_coeff: number of input features. Defaults to the column count of
            the module-level ``t_indep`` tensor, as before.

    Returns:
        Tuple ``(layer1, layer2, const)``, each with gradient tracking on:
        ``layer1`` is ``(n_coeff, n_hidden)``, ``layer2`` is ``(n_hidden, 1)``
        and ``const`` is a 0-dim bias tensor.
    """
    if n_coeff is None:
        n_coeff = t_indep.shape[1]
    # Draw order is unchanged (layer1, layer2, const), so a fixed seed still
    # yields the same initial values.
    layer1 = (nn.rand(n_coeff, n_hidden) - 0.5) / n_hidden
    layer2 = nn.rand(n_hidden, 1) - 0.3
    const = nn.rand(1)[0]
    return layer1.requires_grad_(), layer2.requires_grad_(), const.requires_grad_()
# def init_weights():
#   n_coeff = t_indep.shape[1]
#   coeffs = nn.rand(n_coeff)-0.5
#   return coeffs.requires_grad_()

from fastai.data.transforms import RandomSplitter

# Random train/validation index split of the rows, with a fixed seed.
trn_split, val_split = RandomSplitter(seed=42)(df)

# Slice the features and the targets with the same two index sets.
trn_indep, trn_dep = t_indep[trn_split], t_dep[trn_split]
val_indep, val_dep = t_indep[val_split], t_dep[val_split]
len(trn_indep), len(val_indep)

(6500, 1624)

def update_coeffs(coeffs, lr):
    """Apply one gradient-descent step in place to every tensor in ``coeffs``.

    Each tensor is moved against its gradient by ``grad * lr``, after which
    the gradient is zeroed. ``one_epoch`` calls this under ``no_grad()``.
    """
    for param in coeffs:
        step = param.grad * lr
        param.sub_(step)
        param.grad.zero_()

def one_epoch(coeffs, lr):
    """Run one full-batch training step on the training split.

    Computes the loss on ``trn_indep`` / ``trn_dep``, backpropagates, updates
    the coefficients in place and prints the loss from before the update.
    """
    epoch_loss = calc_loss(coeffs, trn_indep, trn_dep)
    epoch_loss.backward()
    # The parameter update itself must not be tracked by autograd.
    with nn.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{epoch_loss:.3f}", end="; ")

def train_model(epochs=30, lr=0.01):
    """Train freshly initialised coefficients and return them.

    The RNG is re-seeded first, so repeated calls start from the same
    initial weights.
    """
    nn.manual_seed(442)
    coeffs = init_coeffs()
    for _ in range(epochs):
        one_epoch(coeffs, lr=lr)
    return coeffs

# Train for 18 epochs with a learning rate of 0.2 and keep the coefficients.
coeffs = train_model(epochs=18, lr=0.2)

0.507; 0.507; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506; 0.506;

def show_coeffs():
    """Map each input column name to its learned weights.

    For the layered model ``coeffs`` is a ``(layer1, layer2, const)`` tuple,
    which has no ``requires_grad_`` method. The per-feature weights are then
    the rows of ``layer1``, one row per input column. A plain coefficient
    tensor (the earlier single-layer variant) is still accepted.

    Returns:
        dict of column name -> weight tensor, detached from autograd.
    """
    weights = coeffs[0] if isinstance(coeffs, (tuple, list)) else coeffs
    # detach() returns a gradient-free view and leaves the trained tensors
    # themselves untouched.
    return dict(zip(indep_cols, weights.detach()))
show_coeffs()

AttributeError: 'tuple' object has no attribute 'requires_grad_'

def acc(coeffs):
    """Return the fraction of validation rows classified correctly.

    A prediction above 0.5 counts as the positive class and is compared
    with the boolean form of ``val_dep``.
    """
    # Match the predictions to the targets' shape first. Comparing a (N, 1)
    # prediction with a (N,) target would broadcast to (N, N) and measure
    # agreement between unrelated rows.
    preds = calc_preds(coeffs, val_indep).reshape(val_dep.shape)
    return (val_dep.bool() == (preds > 0.5)).float().mean()

# Accuracy of the trained coefficients on the validation split.
acc(coeffs)

tensor(0.4723)