import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from scipy import stats
# Cause plots to be displayed in the notebook:
%pylab inline
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc


df=pd.read_csv('../mushroom_UCI_data/agaricus-lepiota.data')


df.describe()


columns=['class','cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor','gill_attachment',          
'gill_spacing','gill_size','gill_color','stalk_shape','stalk_root','stalk_surface_above_ring' ,'stalk_surface_below_ring', 
'stalk_color_above_ring','stalk_color_below_ring' ,'veil_type','veil_color','ring_number','ring_type','spore_print_color',
'population','habitat']


df.columns=columns


df.head()


df.stalk_root.unique()

array(['c', 'e', 'b', 'r', '?'], dtype=object)


 # Check for unique values in each column
 # option to convert to dictionary if nessesary 
# feature={}
for col in df.columns:
    print(col, ':', df[col].unique())

class : ['e' 'p']
cap_shape : ['x' 'b' 's' 'f' 'k' 'c']
cap_surface : ['s' 'y' 'f' 'g']
cap_color : ['y' 'w' 'g' 'n' 'e' 'p' 'b' 'u' 'c' 'r']
bruises : ['t' 'f']
odor : ['a' 'l' 'p' 'n' 'f' 'c' 'y' 's' 'm']
gill_attachment : ['f' 'a']
gill_spacing : ['c' 'w']
gill_size : ['b' 'n']
gill_color : ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk_shape : ['e' 't']
stalk_root : ['c' 'e' 'b' 'r' '?']
stalk_surface_above_ring : ['s' 'f' 'k' 'y']
stalk_surface_below_ring : ['s' 'f' 'y' 'k']
stalk_color_above_ring : ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk_color_below_ring : ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil_type : ['p']
veil_color : ['w' 'n' 'o' 'y']
ring_number : ['o' 't' 'n']
ring_type : ['p' 'e' 'l' 'f' 'n']
spore_print_color : ['n' 'k' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population : ['n' 's' 'a' 'v' 'y' 'c']
habitat : ['g' 'm' 'u' 'd' 'p' 'w' 'l']


 # Check distribution of the target variable
sns.countplot(x='class', data=df)
plt.show()


sns.countplot(x='odor', hue='class', data=df)

<AxesSubplot:xlabel='odor', ylabel='count'>


# Check correlation between variables
# sns.heatmap(df.corr(), cmap='coolwarm', annot=True)
# plt.show()


 # Check distribution of each categorical feature
# for col in df.columns[:-1]:
#     sns.countplot(x=col, hue='class', data=df)
 #    plt.show()
rows,cols =5,5 
fig, axes = plt.subplots(rows, cols, figsize=(12,12))
k=0
for i in range(rows): 
    for j in range(cols):
        mycol=df.columns[k]
        # sns.violinplot(ax=axes[i,j],data=df.drop(['id','Unnamed: 32'],axis=1),x='diagnosis',y=mycol)
        sns.countplot(ax=axes[i,j], x=mycol, hue='class', data=df)
        # axes[i,j].set_xlabel(None)
        legend_handles, _= axes[i,j].get_legend_handles_labels()
        axes[i,j].get_legend().remove()  # use this is legend=None does not work  
        

        k+=1
#         print(k)
        if k == len(df.columns): break

plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.6,
                    hspace=0.5)
axes[4, 4].remove()  # remove unused subplot
axes[4, 3].remove()  # remove unused subplot
plt.legend(legend_handles,['edible','poison'],loc='lower right',bbox_to_anchor=(1.9,0.35))

<matplotlib.legend.Legend at 0x1815ef9d0>


# print(df['class'].value_counts())
print("number of edible: %i proportion (bassline accuracy): %.3f " % (df['class'].value_counts()[0],df['class'].value_counts()[0]/len(df)) )
print("number of poisionous: %i proportion: %.3f " % (df['class'].value_counts()[1],df['class'].value_counts()[1]/len(df)) )

number of edible: 4208 proportion (bassline accuracy): 0.518 
number of poisionous: 3915 proportion: 0.482


# P_1) odor=NOT(almond.OR.anise.OR.none)
#   120 poisonous cases missed, 98.52% accuracy

# print (sum(((df['odor'] == 'l') | (df['odor'] == 'a') | (df['odor'] == 'n'))))
vc1=df[((df['odor'] == 'l') | (df['odor'] == 'a') | (df['odor'] == 'n'))]['class'].value_counts()
vc2=df[-((df['odor'] == 'l') | (df['odor'] == 'a') | (df['odor'] == 'n'))]['class'].value_counts()
print(vc1)
tn,fn=vc1
tp=vc2[0]
fp=0
print(tn,fp)
print(fn,tp)
print("accuracy of logical rule P_1: %.4f " %((4208+3795)/len(df)) )
print("recall/sensitivity:  %.4f"%(tp/(tp+fn)) )
print("precision:  %.4f"%(tp/(tp+fp)) )
print("f1-score (harmonic mean of precision and recall ):  %.4f"%((2*tp)/(2*tp+fn+fp)) )

e    4208
p     120
Name: class, dtype: int64
4208 0
120 3795
accuracy of logical rule P_1: 0.9852 
recall/sensitivity:  0.9693
precision:  1.0000
f1-score (harmonic mean of precision and recall ):  0.9844


#  P_2) spore-print-color=green 48 cases missed, 99.41% accuracy
P_1=df[((df['odor'] == 'l') | (df['odor'] == 'a') | (df['odor'] == 'n'))]
p_2=P_1[ P_1['spore_print_color']=='r' ]['class'].value_counts()[0] #  this grabs 72 of the 120  
tp=tp+p_2
fn=fn-p_2
print(tn,fp)
print(fn,tp)
print("accuracy of logical rule P_1: %.4f " %((4208+3795)/len(df)) )
print("recall/sensitivity:  %.4f"%(tp/(tp+fn)) )
print("precision:  %.4f"%(tp/(tp+fp)) )
print("f1-score (harmonic mean of precision and recall ):  %.4f"%((2*tp)/(2*tp+fn+fp)) )

4208 0
48 3939
accuracy of logical rule P_1: 0.9852 
recall/sensitivity:  0.9880
precision:  1.0000
f1-score (harmonic mean of precision and recall ):  0.9939


# Using LabelEncoder to convert catergory values to nominal 
from sklearn.preprocessing import LabelEncoder
df_encoded=df.copy()
labelencoder=LabelEncoder()
for column in df.columns:
    df_encoded[column] = labelencoder.fit_transform(df_encoded[column])

df_encoded.head()


# Using OrdinalEncoder to convert catergory values to Ordinal
from sklearn.preprocessing import OrdinalEncoder
df_oencoded=df.copy()
print(df_oencoded['odor'].head())
odor_order=[['n'],['a'],['l'],['m'],['c'],['p'],['s'],['y'],['f']]
Ordinalencoder=OrdinalEncoder(categories=[['n','a','l','m','c','p','s','y','f']])
Ordinalencoder.fit(odor_order)
print(Ordinalencoder.categories_)
test_encoded = Ordinalencoder.transform(df_oencoded['odor'].values.reshape(-1,1))
df_oencoded['odor']=test_encoded.flatten().astype(int)
df_oencoded['odor'].head()

0    a
1    l
2    p
3    n
4    a
Name: odor, dtype: object
[array(['n', 'a', 'l', 'm', 'c', 'p', 's', 'y', 'f'], dtype=object)]

0    1
1    2
2    5
3    0
4    1
Name: odor, dtype: int64


df_oencoded


# this then does the nominal encoding in pandas and skips the odor col
for col in df_oencoded.columns:
    if not col=='odor': df_oencoded[col]=df_oencoded[col].astype("category").cat.codes


# this does the nominal encoding in pandas 
df_encoded=df.copy()
for col in df.columns:
    df_encoded[col]=df_encoded[col].astype("category").cat.codes


df_encoded.head()


for col in df_encoded.columns:
    print(col, ':', df_encoded[col].unique())

class : [0 1]
cap_shape : [5 0 4 2 3 1]
cap_surface : [2 3 0 1]
cap_color : [9 8 3 4 2 5 0 7 1 6]
bruises : [1 0]
odor : [0 3 6 5 2 1 8 7 4]
gill_attachment : [1 0]
gill_spacing : [0 1]
gill_size : [0 1]
gill_color : [ 4  5  2  7 10  3  9  1  0  8 11  6]
stalk_shape : [0 1]
stalk_root : [2 3 1 4 0]
stalk_surface_above_ring : [2 0 1 3]
stalk_surface_below_ring : [2 0 3 1]
stalk_color_above_ring : [7 3 6 4 0 2 5 1 8]
stalk_color_below_ring : [7 6 3 0 4 2 8 5 1]
veil_type : [0]
veil_color : [2 0 1 3]
ring_number : [1 2 0]
ring_type : [4 0 2 1 3]
spore_print_color : [3 2 6 1 7 5 4 8 0]
population : [2 3 0 4 5 1]
habitat : [1 3 5 0 4 6 2]


# nominal encoding is alphanumeric
sns.countplot(x='odor', hue='class', data=df_encoded)

<AxesSubplot:xlabel='odor', ylabel='count'>


sns.countplot(x='odor', hue='class', data=df)

<AxesSubplot:xlabel='odor', ylabel='count'>


# ordianal encoding for odor 
sns.countplot(x='odor', hue='class', data=df_oencoded)

<AxesSubplot:xlabel='odor', ylabel='count'>


# correlation for nominal - these are meaningless but want to compare with odor ordinal
df_encoded.corr().iloc[0]

class                       1.000000
cap_shape                   0.052826
cap_surface                 0.178440
cap_color                  -0.031361
bruises                    -0.501758
odor                       -0.093675
gill_attachment             0.129188
gill_spacing               -0.348358
gill_size                   0.539944
gill_color                 -0.530574
stalk_shape                -0.101888
stalk_root                 -0.379688
stalk_surface_above_ring   -0.334712
stalk_surface_below_ring   -0.298901
stalk_color_above_ring     -0.154096
stalk_color_below_ring     -0.146824
veil_type                        NaN
veil_color                  0.145133
ring_number                -0.214349
ring_type                  -0.411942
spore_print_color           0.172063
population                  0.298776
habitat                     0.216990
Name: class, dtype: float64


# correlation for nominal - these are meaningless but want to compare with odor ordinal
df_oencoded.corr().iloc[0]

class                       1.000000
cap_shape                   0.052826
cap_surface                 0.178440
cap_color                  -0.031361
bruises                    -0.501758
odor                        0.931021
gill_attachment             0.129188
gill_spacing               -0.348358
gill_size                   0.539944
gill_color                 -0.530574
stalk_shape                -0.101888
stalk_root                 -0.379688
stalk_surface_above_ring   -0.334712
stalk_surface_below_ring   -0.298901
stalk_color_above_ring     -0.154096
stalk_color_below_ring     -0.146824
veil_type                        NaN
veil_color                  0.145133
ring_number                -0.214349
ring_type                  -0.411942
spore_print_color           0.172063
population                  0.298776
habitat                     0.216990
Name: class, dtype: float64


# sns.pairplot(data=df_encoded.iloc[:,:4],hue='class')
# variable for pairplot selected from the simple logical rules
# sns.pairplot(data=df_encoded[['class','odor','spore_print_color','stalk_surface_below_ring','stalk_surface_above_ring','population','habitat','cap_color','gill_size']],hue='class')
sns.pairplot(data=df_encoded[['class','odor','spore_print_color','stalk_surface_below_ring','stalk_color_above_ring','gill_size']],hue='class')

<seaborn.axisgrid.PairGrid at 0x14adbf990>


# predict test class:
X=df_encoded.drop(['class','veil_type'],axis=1).values
y=df_encoded['class'].values  # 1 is for poisonous 
xcols=df_encoded.drop(['class','veil_type'],axis=1).columns
xcols

Index(['cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_color', 'ring_number', 'ring_type',
       'spore_print_color', 'population', 'habitat'],
      dtype='object')


X[:,3:6]

array([[1, 0, 1],
       [1, 3, 1],
       [1, 6, 1],
       ...,
       [0, 5, 0],
       [0, 8, 1],
       [0, 5, 0]], dtype=int8)


# predict test class:
Xo=df_oencoded.drop(['class','veil_type'],axis=1).values
yo=df_oencoded['class'].values  # 1 is for poisonous


Xo[:,3:6]

array([[1, 1, 1],
       [1, 2, 1],
       [1, 5, 1],
       ...,
       [0, 0, 0],
       [0, 7, 1],
       [0, 0, 0]])


# doing an L2 model
X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
# Build Model

clf=LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l2')
# Fit Model
clf.fit(X_train,y_train)
# Score
print("accuracy: ",clf.score(X_test,y_test))
y_pred=clf.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# predicted test probability:
# preds = clf.predict_proba(X_test)[:,1]

accuracy:  0.9907692307692307
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[796   0]
 [ 15 814]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       796
           1       1.00      0.98      0.99       829

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625


columns=df_encoded.drop(['class','veil_type'],axis=1).columns
ax=sns.barplot(x=columns, y=clf.coef_[0],palette='tab10',label='l2 regularization') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
clf=LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l2')
rfe = RFE(estimator=clf, n_features_to_select=4, step=1)
rfe.fit(X_train,y_train)
rfe.support_
print(rfe.score(X_test,y_test))
xcols[rfe.support_]

0.936

Index(['gill_spacing', 'gill_size', 'stalk_surface_above_ring', 'veil_color'], dtype='object')


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
clf=LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l2')
rfe = RFE(estimator=clf, n_features_to_select=4, step=1)
rfe.fit(X_train,y_train)
rfe.support_
print(rfe.score(X_test,y_test))
xcols[rfe.support_]

0.932923076923077

Index(['gill_attachment', 'gill_spacing', 'gill_size',
       'stalk_surface_above_ring'],
      dtype='object')


# doing an L1 model
X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
# Build Model

clf=LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l1')
# Fit Model
clf.fit(X_train,y_train)
# Score
print("accuracy: ",clf.score(X_test,y_test))
y_pred=clf.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# predicted test probability:
# preds = clf.predict_proba(X_test)[:,1]

accuracy:  0.9932307692307693
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[796   0]
 [ 11 818]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       796
           1       1.00      0.99      0.99       829

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625


ax=sns.barplot(x=df_encoded.drop(['class','veil_type'],axis=1).columns, y=clf.coef_[0],palette='tab10',label='l1 regularization') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
clf=LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l1')
rfe = RFE(estimator=clf, n_features_to_select=5, step=1)
rfe.fit(X_train,y_train)
rfe.support_
print(rfe.score(X_test,y_test))
# columns[rfe.support_]

0.936

Index(['gill_spacing', 'gill_size', 'stalk_shape', 'stalk_surface_above_ring',
       'veil_color'],
      dtype='object')


xcols[rfe.support_]

Index(['gill_spacing', 'gill_size', 'stalk_shape', 'stalk_surface_above_ring',
       'veil_color'],
      dtype='object')


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
clf=LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l1')
rfe = RFE(estimator=clf, n_features_to_select=5, step=1)
rfe.fit(X_train,y_train)
rfe.support_
print(rfe.score(X_test,y_test))
xcols[rfe.support_]

0.9372307692307692

Index(['gill_attachment', 'gill_spacing', 'gill_size',
       'stalk_surface_above_ring', 'veil_color'],
      dtype='object')


from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

min_features_to_select = 1  # Minimum number of features to consider
clf = LogisticRegression(random_state=42,max_iter=3000,solver='liblinear',penalty='l1')
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 13


n_scores = len(rfecv.cv_results_["mean_test_score"])
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean test accuracy")
plt.errorbar(
    range(min_features_to_select, n_scores + min_features_to_select),
    rfecv.cv_results_["mean_test_score"],
    yerr=rfecv.cv_results_["std_test_score"],
)
plt.title("Recursive Feature Elimination \nwith correlated features")
plt.show()


print(rfecv.ranking_)
columns[rfecv.support_]

[9 1 8 4 1 1 1 1 5 1 1 1 1 3 7 1 1 1 2 1 6]

Index(['cap_surface', 'odor', 'gill_attachment', 'gill_spacing', 'gill_size',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'veil_color', 'ring_number', 'ring_type',
       'population'],
      dtype='object')


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
# clf = make_pipeline(StandardScaler(), SVC(kernel='linear',gamma='auto'))

# should be able to use regularization on the SVC 

clf=SVC(C=10, kernel='linear',gamma='auto') # the c=10 came from the GridSearchCV
# clf = SVC(random_state=42, gamma="auto")
clf.fit(X_train, y_train)
# clf.score(X_test,y_test)

print("accuracy: ",clf.score(X_test,y_test))
y_pred=clf.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy:  1.0
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[796   0]
 [  0 829]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       796
           1       1.00      1.00      1.00       829

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625


clf.class_weight_
clf.coef_

array([[ 1.43423231e-03,  1.36641304e-02, -3.54097819e-02,
         2.09405396e+00, -3.55782306e-01, -1.92366895e+00,
        -1.15727034e+01,  1.06382144e+01, -7.37159763e-03,
        -3.35801530e+00, -5.82849336e+00, -5.43355806e+00,
        -7.68957388e-02,  1.15496009e-03,  3.51232264e-03,
         2.32457632e+01, -1.13532652e+00,  2.65873849e+00,
        -1.49069147e-01, -4.28305903e-01,  4.55364269e-02]])


ax=sns.barplot(x=df_encoded.drop(['class','veil_type'],axis=1).columns, y=clf.coef_[0],palette='tab10',label='SVM kernal=linear l2') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
# clf = make_pipeline(StandardScaler(), SVC(kernel='linear',gamma='auto'))

# should be able to use regularization on the SVC 

clf=SVC(kernel='rbf',gamma='auto')
# clf = SVC(random_state=42, gamma="auto")
clf.fit(X_train, y_train)
# clf.score(X_test,y_test)

print("accuracy: ",clf.score(X_test,y_test))
y_pred=clf.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy:  1.0
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[796   0]
 [  0 829]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       796
           1       1.00      1.00      1.00       829

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625


clf.class_weight_

array([1., 1.])


from sklearn.pipeline import Pipeline

# steps = [('scaler', StandardScaler()),
#          ('SVM', SVC(probability=True) )]


# steps = [('SVM', SVC(gamma='auto') )]
svc=SVC()
#
# There was a trick with specifying params when using pipeline and Grid Search
# note the SVC__ which 

# svc_params = {
#     'SVM__C': [1, 10, 100],
#     'SVM__gamma': [0.001, 0.0001,'auto'],
#     'SVM__kernel': ['linear','rbf']
# }

svc_params = {
    'C': [1, 10, ],
    'kernel': ['linear','rbf']
}

# svc_pipeline = Pipeline(steps)
SVCclf = GridSearchCV(svc, svc_params)
SVCclf.fit(X, y)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ['linear', 'rbf']})


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
print(SVCclf.best_estimator_)
print(SVCclf.best_score_)
print(SVCclf.best_params_)
print("\n")
print("accuracy: ",SVCclf.best_score_)
SVCclf.best_estimator_.fit(X_train,y_train)
y_pred=SVCclf.best_estimator_.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

SVC(C=10, kernel='linear')
0.9351014778325123
{'C': 10, 'kernel': 'linear'}


accuracy:  0.9351014778325123
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[779  17]
 [  4 825]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       796
           1       0.98      1.00      0.99       829

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625


# feature_selection using LinearSVC and SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
print(lsvc.score(X, y))
sfm = SelectFromModel(lsvc, prefit=True)
X_new = sfm.transform(X)

0.9446017481226148


print(len(xcols[sfm.get_support()]))
xcols[sfm.get_support()]

18

Index(['cap_surface', 'cap_color', 'bruises', 'odor', 'gill_spacing',
       'gill_size', 'gill_color', 'stalk_shape', 'stalk_root',
       'stalk_surface_above_ring', 'stalk_surface_below_ring',
       'stalk_color_above_ring', 'stalk_color_below_ring', 'veil_color',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')


# feature_selection using LinearSVC and SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(Xo, yo)
print(lsvc.score(Xo, yo))
sfm = SelectFromModel(lsvc, prefit=True)
X_new = sfm.transform(Xo)
print(len(xcols[sfm.get_support()]))
xcols[sfm.get_support()]

0.9859657761910624
14

Index(['cap_shape', 'cap_color', 'odor', 'gill_spacing', 'gill_size',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'ring_type', 'spore_print_color', 'habitat'],
      dtype='object')


# feature_selection using LinearSVC and SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=10.0, penalty="l2", dual=False).fit(X, y)
print(lsvc.score(X, y))
sfm = SelectFromModel(lsvc, prefit=True)
print(len(xcols[sfm.get_support()]))
xcols[sfm.get_support()]

0.9514957528006894
5

Index(['gill_attachment', 'gill_spacing', 'gill_size',
       'stalk_surface_above_ring', 'veil_color'],
      dtype='object')


# feature_selection using LinearSVC and SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=10.0, penalty="l2", dual=False).fit(Xo, yo)
print(lsvc.score(Xo, yo))
sfm = SelectFromModel(lsvc, prefit=True)
# X_new = sfm.transform(X)
print(len(xcols[sfm.get_support()]))
xcols[sfm.get_support()]

1.0
7

Index(['gill_attachment', 'gill_spacing', 'gill_size', 'stalk_shape',
       'stalk_surface_above_ring', 'veil_color', 'ring_number'],
      dtype='object')


lsvc.score(Xo, yo)

1.0


# seee if this can be improved and try extract important features
X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print("accuracy: ",gnb.score(X_test,y_test))
y_pred=gnb.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy:  0.976
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[783  13]
 [ 26 803]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       796
           1       0.98      0.97      0.98       829

    accuracy                           0.98      1625
   macro avg       0.98      0.98      0.98      1625
weighted avg       0.98      0.98      0.98      1625


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 42)
#  this is improved  from gaussian model but how to extract important features
from sklearn.naive_bayes import CategoricalNB
cnb = CategoricalNB()
cnb.fit(X_train, y_train)
print("accuracy: ",cnb.score(X_test,y_test))
y_pred=cnb.predict(X_test)
print("this is the default for confusion matrix()")
print(np.asarray([['TN', 'FP'], ['FN', 'TP']]))
print("confusion matrix: ")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

cnb.get_params()
# cnb.predict_proba(X_test[0,:])

accuracy:  0.9569230769230769
this is the default for confusion matrix()
[['TN' 'FP']
 ['FN' 'TP']]
confusion matrix: 
[[847   6]
 [ 64 708]]
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       853
           1       0.99      0.92      0.95       772

    accuracy                           0.96      1625
   macro avg       0.96      0.96      0.96      1625
weighted avg       0.96      0.96      0.96      1625

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'min_categories': None}


cnb_probs = cnb.predict_proba(X_test)[:,1]
cnb_fpr, cnb_tpr, _ = roc_curve(y_test, cnb_probs)
cnb_roc_auc = auc(cnb_fpr, cnb_tpr)
ax=plt.plot(cnb_fpr, cnb_tpr,label = 'ROC curve (area = %0.2f)' % cnb_roc_auc)
ax=plt.plot([0.0,1.0],[0.0,1.0],linestyle='--',c='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


## 
X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 1)
best_Kvalue = 0
best_score = 0

for i in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    print(knn.score(X_test,y_test))
    if knn.score(X_test,y_test) > best_score:
        best_score = knn.score(X_test,y_test)
        best_Kvalue = i

print("""Best KNN Value: {}
Test Accuracy: {}%""".format(best_Kvalue, round(best_score*100,2)))

1.0
0.9993846153846154
1.0
1.0
1.0
1.0
1.0
0.9993846153846154
1.0
Best KNN Value: 1
Test Accuracy: 100.0%


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline


dtc=DecisionTreeClassifier()

dtc_params = {
    'criterion': ['gini', 'entropy']
    #'max_features': ['auto', 'sqrt', 'log2'],
    #'max_depth': ['none',10,20] ,
    #'min_samples_split' : [2,5,10],
    #'min_samples_leaf': [1, 2, 4],
    #'class_weight': [None, "balanced"],
    #'min_impurity_decrease': [0.0, 0.1, 0.2]
}


DTCclf = GridSearchCV(dtc, dtc_params,scoring='f1',cv=5,verbose=3) # scoring='f1',scoring=None ,cv=6, verbose=0 
DTCclf.fit(X, y)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END ....................criterion=gini;, score=0.861 total time=   0.0s
[CV 2/5] END ....................criterion=gini;, score=1.000 total time=   0.0s
[CV 3/5] END ....................criterion=gini;, score=0.997 total time=   0.0s
[CV 4/5] END ....................criterion=gini;, score=1.000 total time=   0.0s
[CV 5/5] END ....................criterion=gini;, score=0.752 total time=   0.0s
[CV 1/5] END .................criterion=entropy;, score=0.805 total time=   0.0s
[CV 2/5] END .................criterion=entropy;, score=1.000 total time=   0.0s
[CV 3/5] END .................criterion=entropy;, score=1.000 total time=   0.0s
[CV 4/5] END .................criterion=entropy;, score=1.000 total time=   0.0s
[CV 5/5] END .................criterion=entropy;, score=0.757 total time=   0.0s

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy']}, scoring='f1',
             verbose=3)


# this one below gave a better score even without removing outliers 
print(DTCclf.best_estimator_)
print(DTCclf.best_score_)
print(DTCclf.best_params_)
best_params = DTCclf.best_params_

DecisionTreeClassifier()
0.9220454941623997
{'criterion': 'gini'}


depth = DTCclf.best_estimator_.tree_.max_depth  # Get the depth of the tree
num_nodes = DTCclf.best_estimator_.tree_.node_count  # Get the number of nodes in the tree

print("Depth of the decision tree:", depth)
print("Number of nodes in the decision tree:", num_nodes)

Depth of the decision tree: 7
Number of nodes in the decision tree: 39


from sklearn.model_selection import cross_val_score
dtc=DecisionTreeClassifier(criterion='gini')
print(cross_val_score(dtc, X, y, cv=5))
dtc=DecisionTreeClassifier(criterion='entropy')
print(cross_val_score(dtc, X, y, cv=5))
dtc=DecisionTreeClassifier(criterion='gini')
print(cross_val_score(dtc, Xo, y, cv=5))
dtc=DecisionTreeClassifier(criterion='entropy')
print(cross_val_score(dtc, Xo, y, cv=5))

[0.84307692 1.         1.         1.         0.92610837]
[0.84307692 1.         1.         1.         0.95566502]
[1.         1.         1.         1.         0.69950739]
[1.         1.         1.         1.         0.78817734]


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size = 0.2, random_state = 0 ) 
# Make predictions on the testing data
dtc= DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
 # Evaluate the model's accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
 # Print the confusion matrix
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
depth = dtc.tree_.max_depth  # Get the depth of the tree
num_nodes = dtc.tree_.node_count  # Get the number of nodes in the tree

print("Depth of the decision tree:", depth)
print("Number of nodes in the decision tree:", num_nodes)

Accuracy: 1.0
Confusion matrix:
[[838   0]
 [  0 787]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       838
           1       1.00      1.00      1.00       787

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Depth of the decision tree: 7
Number of nodes in the decision tree: 17


ax=sns.barplot(x=df_encoded.drop(['class','veil_type'],axis=1).columns, y=dtc.feature_importances_,palette='tab10',label='DTC Feature importances ') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


# GINI get smaller as we have less impurity 
from sklearn import tree
ax=plt.figure(figsize=[12,10])
ax= tree.plot_tree(dtc,max_depth=2,filled=True,feature_names=df_encoded.drop(['class','veil_type'],axis=1).columns,class_names=['e','p'])


# feature_selection using best estimator and SelectFromModel
from sklearn.feature_selection import SelectFromModel
bdtc = dtc.fit(Xo, yo)
sfm = SelectFromModel(bdtc, prefit=True)
X_new = sfm.transform(Xo)
print(np.shape(X_new))
columns[sfm.get_support()]

(8123, 1)

Index(['odor'], dtype='object')


# feature_selection using best estimator and SelectFromModel
from sklearn.feature_selection import SelectFromModel
bdtc = dtc.fit(X, y)
sfm = SelectFromModel(bdtc, prefit=True)
X_new = sfm.transform(X)
print(np.shape(X_new))
columns[sfm.get_support()]

(8123, 4)

Index(['gill_size', 'gill_color', 'spore_print_color', 'population'], dtype='object')


dtc=DecisionTreeClassifier(criterion='gini')
cross_val_score(dtc, X_new, y, cv=5)

array([0.91692308, 0.98953846, 0.98276923, 0.99876847, 0.64593596])


X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2, random_state = 0 ) 
# Make predictions on the testing data
dtc= DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
 # Evaluate the model's accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
 # Print the confusion matrix
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
dtc.feature_importances_

Accuracy: 0.9858461538461538
Confusion matrix:
[[830   8]
 [ 15 772]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       838
           1       0.99      0.98      0.99       787

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625

array([0.1616803 , 0.36786823, 0.25534178, 0.21510968])


ax=sns.barplot(x=columns[sfm.get_support()], y=dtc.feature_importances_,palette='tab10',label='DTC Feature importances ') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


depth = dtc.tree_.max_depth  # Get the depth of the tree
num_nodes = dtc.tree_.node_count  # Get the number of nodes in the tree

print("Depth of the decision tree:", depth)
print("Number of nodes in the decision tree:", num_nodes)

Depth of the decision tree: 11
Number of nodes in the decision tree: 63


# GINI get smaller as we have less impurity 
from sklearn import tree
ax=plt.figure(figsize=[12,10])
ax= tree.plot_tree(dtc,max_depth=2,filled=True,feature_names=columns[sfm.get_support()],class_names=['e','p'])


# fixing the depth 
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2, random_state = 0 ) 
# Make predictions on the testing data
dtc= DecisionTreeClassifier(criterion='gini', max_depth=4)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
 # Evaluate the model's accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
 # Print the confusion matrix
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
depth = dtc.tree_.max_depth  # Get the depth of the tree
num_nodes = dtc.tree_.node_count  # Get the number of nodes in the tree

print("Depth of the decision tree:", depth)
print("Number of nodes in the decision tree:", num_nodes)

Accuracy: 0.9673846153846154
Confusion matrix:
[[805  33]
 [ 20 767]]
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       838
           1       0.96      0.97      0.97       787

    accuracy                           0.97      1625
   macro avg       0.97      0.97      0.97      1625
weighted avg       0.97      0.97      0.97      1625

Depth of the decision tree: 4
Number of nodes in the decision tree: 25


# GINI get smaller as we have less impurity 
# the tree is the same as above it just stoped at the specified depth
from sklearn import tree
ax=plt.figure(figsize=[12,10])
ax= tree.plot_tree(dtc,max_depth=2,filled=True,feature_names=columns[sfm.get_support()],class_names=['e','p'])


from sklearn.model_selection import learning_curve
dtc= DecisionTreeClassifier(criterion='gini')
train_sizes, train_scores, valid_scores = learning_curve(dtc, X, y, train_sizes=np.linspace(0.1,1.0,20), cv=5, shuffle=False)


my_mean_valid_scores=np.mean(np.vstack(valid_scores),axis=1)

rts= np.reshape(train_scores.T,(1,-1))[0]
rvs= np.reshape(valid_scores.T,(1,-1))[0]
df=pd.DataFrame([rts,rvs,list(train_sizes)*5]).T
df.columns=['train_scores','valid_scores','train_sizes']
ax= sns.lineplot(data=df,x='train_sizes',y='train_scores',marker='v',label='train score')
ax= sns.lineplot(data=df,x='train_sizes',y='valid_scores',label='test score')
ax= sns.scatterplot(x=train_sizes ,y=my_mean_valid_scores,label='mean valid score',c='k',alpha=0.6)
ax=plt.legend(loc=4)


# reset data arrays
print(df_encoded.odor.head(10))
print(df_oencoded.odor.head(10))
X=df_encoded.drop(['class','veil_type'],axis=1).values
y=df_encoded['class'].values  # 1 is for poisonous 
Xo=df_oencoded.drop(['class','veil_type'],axis=1).values
yo=df_oencoded['class'].values  # 1 is for poisonous

0    0
1    3
2    6
3    5
4    0
5    0
6    3
7    6
8    0
9    3
Name: odor, dtype: int8
0    1
1    2
2    5
3    0
4    1
5    1
6    2
7    5
8    1
9    2
Name: odor, dtype: int64


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

np.random.seed(42)

#steps = [('scaler', StandardScaler()),
#         ('DTC', DecisionTreeClassifier())]
dtc=DecisionTreeClassifier()
# There was a trick with specifying params when using pipeline and Grid Search
# note the SVC__ which 

dtc_params = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
  #  'DTC__max_depth': ['none',10,20] ,
    'min_samples_split' : [2,5,10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, "balanced"],
    'min_impurity_decrease': [0.0, 0.1, 0.2]
}

# dtc_pipeline = Pipeline(steps)
DTCclf = GridSearchCV(dtc, dtc_params, scoring='f1',cv=5, verbose=1) # scoring='f1'
DTCclf.fit(X, y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'class_weight': [None, 'balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_impurity_decrease': [0.0, 0.1, 0.2],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             scoring='f1', verbose=1)


print(DTCclf.best_estimator_)
print(DTCclf.best_score_)
print(DTCclf.best_params_)
best_params = DTCclf.best_params_

depth = DTCclf.best_estimator_.tree_.max_depth  # Get the depth of the tree
num_nodes = DTCclf.best_estimator_.tree_.node_count  # Get the number of nodes in the tree

print("Depth of the decision tree:", depth)
print("Number of nodes in the decision tree:", num_nodes)

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features='auto', min_samples_split=10)
0.965881777454906
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10}
Depth of the decision tree: 11
Number of nodes in the decision tree: 47


ax=sns.barplot(x=xcols, y=DTCclf.best_estimator_.feature_importances_,palette='tab10',label='DTC Feature importances ') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


np.shape(X)

(8123, 21)


# feature_selection using best estimator and SelectFromModel
bdtc = DTCclf.best_estimator_   # no need to refit the best estimator 
sfm = SelectFromModel(bdtc, prefit=True)
X_new = sfm.transform(X)
print(np.shape(X_new))
xcols_new=xcols[sfm.get_support()]
print(xcols_new)

(8123, 4)
Index(['odor', 'gill_size', 'ring_type', 'spore_print_color'], dtype='object')


# fixing the depth 
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2, random_state = 42 ) 
np.shape(X_train)
# Make predictions on the testing data
DTCclf.best_estimator_.fit(X_train, y_train)
y_pred = DTCclf.best_estimator_.predict(X_test)
 # Evaluate the model's accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
 # Print the confusion matrix
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
depth = DTCclf.best_estimator_.tree_.max_depth  # Get the depth of the tree
num_nodes = DTCclf.best_estimator_.tree_.node_count  # Get the number of nodes in the tree

print("Depth of the decision tree:", depth)
print("Number of nodes in the decision tree:", num_nodes)
DTCclf.best_estimator_.feature_importances_

Accuracy: 0.9944615384615385
Confusion matrix:
[[844   9]
 [  0 772]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       853
           1       0.99      1.00      0.99       772

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625

Depth of the decision tree: 8
Number of nodes in the decision tree: 39

array([0.48986489, 0.24054414, 0.01317085, 0.25642012])


ax=sns.barplot(x=xcols_new, y=DTCclf.best_estimator_.feature_importances_,palette='tab10',label='DTC Feature importances ') 
ax=plt.xticks(rotation=90)
ax=plt.legend()


# GINI get smaller as we have less impurity 
# the tree is the same as above it just stoped at the specified depth
ax=plt.figure(figsize=[12,10])
ax= tree.plot_tree(DTCclf.best_estimator_,max_depth=3,filled=True,feature_names=xcols_new ,class_names=['e','p'])


# this is more of an unsupervised learning excersis to determine feature importance 
# not going to include this

import numpy as np
from sklearn.decomposition import PCA

scaler = StandardScaler()
 # fit and transform the data
scaler.fit(X)
# X_scaled = scaler.fit_transform(X)
# pca = PCA(n_components=0.95)
# pca = PCA(n_components=2)
pca0 = PCA(n_components=21)
pca = PCA(n_components=21)
pca.fit(X_scaled)
pca0.fit(X)
# print(pca.explained_variance_ratio_)

# print(pca.singular_values_)  # eignvalue from SVD are ordered 
# print(pca.components_)     # this are eignvectors 


 # Create Scree Plot
plt.plot(range(1, pca.n_components_+1), np.cumsum(pca.explained_variance_ratio_), 'bo-', linewidth=2,label='scaled')
plt.plot(range(1, pca0.n_components_+1), np.cumsum(pca0.explained_variance_ratio_), 'ro-', linewidth=2,label='unscaled')
plt.axhline(1,linestyle='--',c='k')
plt.axvline(10,linestyle=':',c='g')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('% of Variance Explained')
plt.legend()
plt.show()


ev=pd.DataFrame(pca.components_[:2],columns=df_encoded.drop(['class','veil_type'],axis=1).columns)
# print(pca.components_[:2])
# print(ev.head())
# ev.plot.bar()
# ax=ev.iloc[0].plot.bar(palette='tab10')
# sns.barplot(data=ev.iloc[0])
# sns.barplot(data=ev)
# ax=plt.xticks(rotation=90)


 # plot the eigenvectors as barplots
colors = ['red', 'green', 'blue', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive'] 
palette = sns.color_palette('Dark2', n_colors=pca.components_.shape[0])
palette = sns.color_palette('Dark2')
columns=df_encoded.drop(['class','veil_type'],axis=1).columns
plt.figure(figsize=(15, 12))
for i in range(pca.components_[:4].shape[0]):
    plt.subplot(2, 2, i+1)
    bars=plt.bar(range(pca.components_.shape[1]), pca.components_[i],color=palette)
    plt.title(f"Principal component {i+1} (ExplVar: %.3f)" %pca.explained_variance_ratio_[i])
    plt.xticks(range(X.shape[1]), columns)
    for bar in bars:
        plt.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height(), round(bar.get_height(), 2), ha='center', va='bottom')
    bars=plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# this is more of an unsupervised learning excersis to determine feature importance  


scaler = StandardScaler()
 # fit and transform the data
scaler.fit(Xo)
Xo_scaled = scaler.fit_transform(Xo)
# pca = PCA(n_components=0.95)
# pca = PCA(n_components=2)
pca0 = PCA(n_components=21)
pca = PCA(n_components=21)
pca.fit(Xo_scaled)
pca0.fit(Xo)
# print(pca.explained_variance_ratio_)

# print(pca.singular_values_)  # eignvalue from SVD are ordered 
# print(pca.components_)     # this are eignvectors 


 # Create Scree Plot
plt.plot(range(1, pca.n_components_+1), np.cumsum(pca.explained_variance_ratio_), 'bo-', linewidth=2,label='scaled')
plt.plot(range(1, pca0.n_components_+1), np.cumsum(pca0.explained_variance_ratio_), 'ro-', linewidth=2,label='unscaled')
plt.axhline(1,linestyle='--',c='k')
plt.axvline(10,linestyle=':',c='g')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('% of Variance Explained')
plt.legend()
plt.show()


ev=pd.DataFrame(pca.components_[:2],columns=xcols)
# print(pca.components_[:2])
# print(ev.head())
# ev.plot.bar()
# ax=ev.iloc[0].plot.bar(palette='tab10')
# sns.barplot(data=ev.iloc[0])
# sns.barplot(data=ev)
# ax=plt.xticks(rotation=90)


 # plot the eigenvectors as barplots
colors = ['red', 'green', 'blue', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive'] 
palette = sns.color_palette('Dark2', n_colors=pca.components_.shape[0])
palette = sns.color_palette('Dark2')
columns=df_encoded.drop(['class','veil_type'],axis=1).columns
plt.figure(figsize=(15, 12))
for i in range(pca.components_[:4].shape[0]):
    plt.subplot(2, 2, i+1)
    bars=plt.bar(range(pca.components_.shape[1]), pca.components_[i],color=palette)
    plt.title(f"Principal component {i+1} (ExplVar: %.3f)" %pca.explained_variance_ratio_[i])
    plt.xticks(range(X.shape[1]), columns)
    for bar in bars:
        plt.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height(), round(bar.get_height(), 2), ha='center', va='bottom')
    bars=plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

Import modules¶

Read Data¶

EDA and some vizies¶

include logical tests here¶

We can see that just by using simple 'odor' and 'spore_print_color' we can distinguish if a mushroom is poisonous¶

ie. a foul smelling mushroom with green spores is more likely to be poisonous.¶

Encoding the categorical vars¶

we are using nominal encoding and a modified data set where the odor is ordinally encoded.¶

above was nomianlly ordered¶

note: the '?' for stalk root has been coded as a 0¶

The veil_type variable was fixed to a single value so this axis is removed.¶

It is worth noting in many cases in this notebook to perform analogous calculation on either the nominal or 'odor'-ordinal dataset, I have used the same cell and replaced the input X,y with Xo,yo¶

Logistic regression¶

Plot coefficents for L2 regularization¶

Example of how RFE is picking out the same best features based on coef¶

Logistic regression on L1¶

plot coefficents for L1 regularization¶

below RFECV fit takes a while¶

SVM part¶

Have also considered LinearSVC¶

Also going to test out the Feature selection using SelectFromModel¶

Naive Bayes¶

Descision tree with Grid search and feature selection¶

learning curve¶

Principal Component Analysis¶

check PCA with the odors as ordinal¶

	p	x	s	n	t	p.1	f	c	n.1	k	...	s.2	w	w.1	p.2	w.2	o	p.3	k.1	s.3	u
count	8123	8123	8123	8123	8123	8123	8123	8123	8123	8123	...	8123	8123	8123	8123	8123	8123	8123	8123	8123	8123
unique	2	6	4	10	2	9	2	2	2	12	...	4	9	9	1	4	3	5	9	6	7
top	e	x	y	n	f	n	f	c	b	b	...	s	w	w	p	w	o	p	w	v	d
freq	4208	3655	3244	2283	4748	3528	7913	6811	5612	1728	...	4935	4463	4383	8123	7923	7487	3967	2388	4040	3148

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_type	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	e	x	s	y	t	a	f	c	b	k	...	s	w	w	p	w	o	p	n	n	g
1	e	b	s	w	t	l	f	c	b	n	...	s	w	w	p	w	o	p	n	n	m
2	p	x	y	w	t	p	f	c	n	n	...	s	w	w	p	w	o	p	k	s	u
3	e	x	s	g	f	n	f	w	b	k	...	s	w	w	p	w	o	e	n	a	g
4	e	x	y	y	t	a	f	c	b	n	...	s	w	w	p	w	o	p	k	n	g

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	0	5	2	9	1	0	1	0	0	4	...	2	7	7	2	1	4	3	2	1
1	0	0	2	8	1	3	1	0	0	5	...	2	7	7	2	1	4	3	2	3
2	1	5	3	8	1	6	1	0	1	5	...	2	7	7	2	1	4	2	3	5
3	0	5	2	3	0	5	1	1	0	4	...	2	7	7	2	1	0	3	0	1
4	0	5	3	9	1	0	1	0	0	5	...	2	7	7	2	1	4	2	2	1

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_type	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	e	x	s	y	t	1	f	c	b	k	...	s	w	w	p	w	o	p	n	n	g
1	e	b	s	w	t	2	f	c	b	n	...	s	w	w	p	w	o	p	n	n	m
2	p	x	y	w	t	5	f	c	n	n	...	s	w	w	p	w	o	p	k	s	u
3	e	x	s	g	f	0	f	w	b	k	...	s	w	w	p	w	o	e	n	a	g
4	e	x	y	y	t	1	f	c	b	n	...	s	w	w	p	w	o	p	k	n	g
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8118	e	k	s	n	f	0	a	c	b	y	...	s	o	o	p	o	o	p	b	c	l
8119	e	x	s	n	f	0	a	c	b	y	...	s	o	o	p	n	o	p	b	v	l
8120	e	f	s	n	f	0	a	c	b	n	...	s	o	o	p	o	o	p	b	c	l
8121	p	k	y	n	f	7	f	c	n	b	...	k	w	w	p	w	o	e	w	v	l
8122	e	x	s	n	f	0	a	c	b	y	...	s	o	o	p	o	o	p	o	c	l

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	0	5	2	9	1	0	1	0	0	4	...	2	7	7	2	1	4	3	2	1
1	0	0	2	8	1	3	1	0	0	5	...	2	7	7	2	1	4	3	2	3
2	1	5	3	8	1	6	1	0	1	5	...	2	7	7	2	1	4	2	3	5
3	0	5	2	3	0	5	1	1	0	4	...	2	7	7	2	1	0	3	0	1
4	0	5	3	9	1	0	1	0	0	5	...	2	7	7	2	1	4	2	2	1

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_type	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	e	x	s	y	t	a	f	c	b	k	...	s	w	w	p	w	o	p	n	n	g
1	e	b	s	w	t	l	f	c	b	n	...	s	w	w	p	w	o	p	n	n	m
2	p	x	y	w	t	p	f	c	n	n	...	s	w	w	p	w	o	p	k	s	u
3	e	x	s	g	f	n	f	w	b	k	...	s	w	w	p	w	o	e	n	a	g
4	e	x	y	y	t	a	f	c	b	n	...	s	w	w	p	w	o	p	k	n	g

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	0	5	2	9	1	0	1	0	0	4	...	2	7	7	2	1	4	3	2	1
1	0	0	2	8	1	3	1	0	0	5	...	2	7	7	2	1	4	3	2	3
2	1	5	3	8	1	6	1	0	1	5	...	2	7	7	2	1	4	2	3	5
3	0	5	2	3	0	5	1	1	0	4	...	2	7	7	2	1	0	3	0	1
4	0	5	3	9	1	0	1	0	0	5	...	2	7	7	2	1	4	2	2	1

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	0	5	2	9	1	0	1	0	0	4	...	2	7	7	2	1	4	3	2	1
1	0	0	2	8	1	3	1	0	0	5	...	2	7	7	2	1	4	3	2	3
2	1	5	3	8	1	6	1	0	1	5	...	2	7	7	2	1	4	2	3	5
3	0	5	2	3	0	5	1	1	0	4	...	2	7	7	2	1	0	3	0	1
4	0	5	3	9	1	0	1	0	0	5	...	2	7	7	2	1	4	2	2	1

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_type	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	e	x	s	y	t	a	f	c	b	k	...	s	w	w	p	w	o	p	n	n	g
1	e	b	s	w	t	l	f	c	b	n	...	s	w	w	p	w	o	p	n	n	m
2	p	x	y	w	t	p	f	c	n	n	...	s	w	w	p	w	o	p	k	s	u
3	e	x	s	g	f	n	f	w	b	k	...	s	w	w	p	w	o	e	n	a	g
4	e	x	y	y	t	a	f	c	b	n	...	s	w	w	p	w	o	p	k	n	g

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	0	5	2	9	1	0	1	0	0	4	...	2	7	7	2	1	4	3	2	1
1	0	0	2	8	1	3	1	0	0	5	...	2	7	7	2	1	4	3	2	3
2	1	5	3	8	1	6	1	0	1	5	...	2	7	7	2	1	4	2	3	5
3	0	5	2	3	0	5	1	1	0	4	...	2	7	7	2	1	0	3	0	1
4	0	5	3	9	1	0	1	0	0	5	...	2	7	7	2	1	4	2	2	1

	class	cap_shape	cap_surface	cap_color	bruises	odor	gill_attachment	gill_spacing	gill_size	gill_color	...	stalk_surface_below_ring	stalk_color_above_ring	stalk_color_below_ring	veil_color	ring_number	ring_type	spore_print_color	population	habitat
0	0	5	2	9	1	0	1	0	0	4	...	2	7	7	2	1	4	3	2	1
1	0	0	2	8	1	3	1	0	0	5	...	2	7	7	2	1	4	3	2	3
2	1	5	3	8	1	6	1	0	1	5	...	2	7	7	2	1	4	2	3	5
3	0	5	2	3	0	5	1	1	0	4	...	2	7	7	2	1	0	3	0	1
4	0	5	3	9	1	0	1	0	0	5	...	2	7	7	2	1	4	2	2	1