This notebook builds classifiers to identify a voice as male or female, based on acoustic properties of the voice and speech. The dataset consists of 3,168 recorded voice samples collected from male and female speakers. The voice samples were pre-processed by acoustic analysis over the 0–280 Hz frequency range, which covers the typical human vocal range.
The following acoustic properties of each voice are measured and included in the CSV: meanfreq, sd, median, Q25, Q75, IQR, skew, kurt, sp.ent, sfm, mode, centroid, meanfun, minfun, maxfun, meandom, mindom, maxdom, dfrange, and modindx, along with a label column identifying the speaker as male or female.
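Before diving in, it can help to confirm the class balance. This quick check is an addition here, not part of the original notebook; the dataset is documented as an even male/female split:

# Added sanity check (not part of the original analysis):
# confirm the dataset is balanced between the two classes
import pandas as pd
voice = pd.read_csv("data/voice.csv")   # same file loaded below
print(voice['label'].value_counts())    # expected: 1584 male, 1584 female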
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import pandas as pd
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pydotplus
from IPython.display import Image
pd.options.mode.chained_assignment = None # default='warn'
matplotlib.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams.update({'font.size': 15})
# Helper: horizontal bar chart of a fitted model's feature importances,
# labeled with the dataset's column names (all columns except 'label')
def plot_feature_importances_mydata(model):
    n_features = X_train.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), list(mydata)[:-1])
    plt.xlabel("Variable importance")
    plt.ylabel("Independent Variable")
#Read the voice dataset
mydata = pd.read_csv("data/voice.csv")
mydata
| | meanfreq | sd | median | Q25 | Q75 | IQR | skew | kurt | sp.ent | sfm | ... | centroid | meanfun | minfun | maxfun | meandom | mindom | maxdom | dfrange | modindx | label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.059781 | 0.064241 | 0.032027 | 0.015071 | 0.090193 | 0.075122 | 12.863462 | 274.402906 | 0.893369 | 0.491918 | ... | 0.059781 | 0.084279 | 0.015702 | 0.275862 | 0.007812 | 0.007812 | 0.007812 | 0.000000 | 0.000000 | male |
1 | 0.066009 | 0.067310 | 0.040229 | 0.019414 | 0.092666 | 0.073252 | 22.423285 | 634.613855 | 0.892193 | 0.513724 | ... | 0.066009 | 0.107937 | 0.015826 | 0.250000 | 0.009014 | 0.007812 | 0.054688 | 0.046875 | 0.052632 | male |
2 | 0.077316 | 0.083829 | 0.036718 | 0.008701 | 0.131908 | 0.123207 | 30.757155 | 1024.927705 | 0.846389 | 0.478905 | ... | 0.077316 | 0.098706 | 0.015656 | 0.271186 | 0.007990 | 0.007812 | 0.015625 | 0.007812 | 0.046512 | male |
3 | 0.151228 | 0.072111 | 0.158011 | 0.096582 | 0.207955 | 0.111374 | 1.232831 | 4.177296 | 0.963322 | 0.727232 | ... | 0.151228 | 0.088965 | 0.017798 | 0.250000 | 0.201497 | 0.007812 | 0.562500 | 0.554688 | 0.247119 | male |
4 | 0.135120 | 0.079146 | 0.124656 | 0.078720 | 0.206045 | 0.127325 | 1.101174 | 4.333713 | 0.971955 | 0.783568 | ... | 0.135120 | 0.106398 | 0.016931 | 0.266667 | 0.712812 | 0.007812 | 5.484375 | 5.476562 | 0.208274 | male |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3163 | 0.131884 | 0.084734 | 0.153707 | 0.049285 | 0.201144 | 0.151859 | 1.762129 | 6.630383 | 0.962934 | 0.763182 | ... | 0.131884 | 0.182790 | 0.083770 | 0.262295 | 0.832899 | 0.007812 | 4.210938 | 4.203125 | 0.161929 | female |
3164 | 0.116221 | 0.089221 | 0.076758 | 0.042718 | 0.204911 | 0.162193 | 0.693730 | 2.503954 | 0.960716 | 0.709570 | ... | 0.116221 | 0.188980 | 0.034409 | 0.275862 | 0.909856 | 0.039062 | 3.679688 | 3.640625 | 0.277897 | female |
3165 | 0.142056 | 0.095798 | 0.183731 | 0.033424 | 0.224360 | 0.190936 | 1.876502 | 6.604509 | 0.946854 | 0.654196 | ... | 0.142056 | 0.209918 | 0.039506 | 0.275862 | 0.494271 | 0.007812 | 2.937500 | 2.929688 | 0.194759 | female |
3166 | 0.143659 | 0.090628 | 0.184976 | 0.043508 | 0.219943 | 0.176435 | 1.591065 | 5.388298 | 0.950436 | 0.675470 | ... | 0.143659 | 0.172375 | 0.034483 | 0.250000 | 0.791360 | 0.007812 | 3.593750 | 3.585938 | 0.311002 | female |
3167 | 0.165509 | 0.092884 | 0.183044 | 0.070072 | 0.250827 | 0.180756 | 1.705029 | 5.769115 | 0.938829 | 0.601529 | ... | 0.165509 | 0.185607 | 0.062257 | 0.271186 | 0.227022 | 0.007812 | 0.554688 | 0.546875 | 0.350000 | female |
3168 rows × 21 columns
#Plot the histograms
male = mydata.loc[mydata['label']=='male']
female = mydata.loc[mydata['label']=='female']
fig, axes = plt.subplots(10, 2, figsize=(20,40))
ax = axes.ravel()
for i in range(20):
    ax[i].hist(male.iloc[:, i], bins=20, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(female.iloc[:, i], bins=20, color=mglearn.cm3(2), alpha=.5)
    ax[i].set_title(list(male)[i])
    ax[i].set_yticks(())
ax[0].set_xlabel("Feature magnitude")
ax[0].set_ylabel("Frequency")
ax[0].legend(["male", "female"], loc="best")
fig.tight_layout()
plt.show()
#Prepare data for modeling
mydata.loc[mydata['label'] == "male", 'label'] = 0
mydata.loc[mydata['label'] == "female", 'label'] = 1
mydata_train, mydata_test = train_test_split(mydata, random_state=0, test_size=.2)
scaler = StandardScaler()
scaler.fit(mydata_train.iloc[:,0:20])
X_train = scaler.transform(mydata_train.iloc[:,0:20])
X_test = scaler.transform(mydata_test.iloc[:,0:20])
y_train = list(mydata_train['label'].values)
y_test = list(mydata_test['label'].values)
#Train decision tree model
tree = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                              max_features=1, max_leaf_nodes=None, min_samples_leaf=5,
                              min_samples_split=2, min_weight_fraction_leaf=0.0,
                              random_state=100, splitter='best')
tree.fit(X_train, y_train)
print("Decision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
Decision Tree
Accuracy on training set: 0.951
Accuracy on test set: 0.935
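A single train/test split gives only one estimate of generalization. As a sketch (not in the original notebook), 5-fold cross-validation with the same hyperparameters can confirm the figure is representative; scaling is done inside a pipeline so each fold is standardized using only its own training portion:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
# Added sketch: 5-fold CV with the same tree hyperparameters; the pipeline
# re-fits the scaler on each training fold to avoid leakage
pipe = make_pipeline(StandardScaler(),
                     DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                            max_features=1, min_samples_leaf=5,
                                            random_state=100))
scores = cross_val_score(pipe, mydata.iloc[:, 0:20], mydata['label'].astype(int), cv=5)
print("Cross-validated accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))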
l = list(mydata)
l.remove('label')
print(l)
n_features = X_train.shape[1]
print (n_features)
['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt', 'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx']
20
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=l,
                           filled=True, rounded=True,
                           special_characters=True)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
# Show graph
Image(graph.create_png())
#graph = graphviz.Source(dot_data)
#graph
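As an aside, if Graphviz/pydotplus is unavailable, scikit-learn's built-in plot_tree (added in scikit-learn 0.21) can render the same tree with matplotlib alone; a minimal sketch:

from sklearn.tree import plot_tree
# Dependency-free alternative to the pydotplus/Graphviz rendering above
# (class 0 = male, class 1 = female, matching the encoding used here)
plt.figure(figsize=(20, 10))
plot_tree(tree, feature_names=l, class_names=["male", "female"], filled=True, rounded=True)
plt.show()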
matplotlib.rcParams.update({'font.size': 15})
plt.figure(figsize=(15,10))
plot_feature_importances_mydata(tree)
plt.show()
#Prepare data for modeling
mydata1 = mydata[['meanfun', 'IQR', 'sd', 'label']].copy()  # 'label' is already encoded as 0/1 above
mydata1_train, mydata1_test = train_test_split(mydata1, random_state=0, test_size=.2)
scaler = StandardScaler()
scaler.fit(mydata1_train.iloc[:,0:3])
X_train = scaler.transform(mydata1_train.iloc[:,0:3])
X_test = scaler.transform(mydata1_test.iloc[:,0:3])
y_train = list(mydata1_train['label'].values)
y_test = list(mydata1_test['label'].values)
tree = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                              max_features=1, max_leaf_nodes=None, min_samples_leaf=5,
                              min_samples_split=2, min_weight_fraction_leaf=0.0,
                              random_state=100, splitter='best')
tree.fit(X_train, y_train)
print("Decision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
Decision Tree
Accuracy on training set: 0.965
Accuracy on test set: 0.950
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=list(mydata1)[:-1],
                           filled=True, rounded=True,
                           special_characters=True)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
# Show graph
Image(graph.create_png())
def plot_feature_importances_mydata1(model):
    n_features = X_train.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), list(mydata1)[:-1])
    plt.xlabel("Variable importance")
    plt.ylabel("Independent Variable")
plt.figure(figsize=(15,5))
plot_feature_importances_mydata1(tree)
plt.show()
#Read the voice dataset
mydata = pd.read_csv("data/voice.csv")
#Prepare data for modeling
mydata.loc[mydata['label'] == "male", 'label'] = 0
mydata.loc[mydata['label'] == "female", 'label'] = 1
mydata_train, mydata_test = train_test_split(mydata, random_state=0, test_size=.2)
scaler = StandardScaler()
scaler.fit(mydata_train.iloc[:,0:20])
X_train = scaler.transform(mydata_train.iloc[:,0:20])
X_test = scaler.transform(mydata_test.iloc[:,0:20])
y_train = list(mydata_train['label'].values)
y_test = list(mydata_test['label'].values)
#Train random forest model
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
print("Random Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
matplotlib.rcParams.update({'font.size': 15})
plt.figure(figsize=(15,10))
plot_feature_importances_mydata(forest)
plt.show()
Random Forests
Accuracy on training set: 0.998
Accuracy on test set: 0.976
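Five trees is a very small forest. As a quick added check (not in the original notebook), growing the ensemble shows how the test accuracy behaves as n_estimators increases:

# Added sketch: test-set accuracy vs. forest size on the full 20-feature split
for n in [5, 25, 100]:
    rf = RandomForestClassifier(n_estimators=n, random_state=0).fit(X_train, y_train)
    print("n_estimators={:>3}: test accuracy = {:.3f}".format(n, rf.score(X_test, y_test)))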
#Prepare data for modeling
mydata1 = mydata[['meanfun', 'IQR', 'sd', 'label']].copy()  # 'label' is already encoded as 0/1 above
mydata1_train, mydata1_test = train_test_split(mydata1, random_state=0, test_size=.2)
scaler = StandardScaler()
scaler.fit(mydata1_train.iloc[:,0:3])
X_train = scaler.transform(mydata1_train.iloc[:,0:3])
X_test = scaler.transform(mydata1_test.iloc[:,0:3])
y_train = list(mydata1_train['label'].values)
y_test = list(mydata1_test['label'].values)
#Train random forest model
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
print("Random Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
plt.figure(figsize=(15,5))
plot_feature_importances_mydata1(forest)
plt.show()
Random Forests
Accuracy on training set: 0.996
Accuracy on test set: 0.972
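Finally, accuracy alone says nothing about which class the errors fall on. A per-class breakdown for the 3-feature forest (an added sketch, not in the original notebook) via scikit-learn's classification_report:

from sklearn.metrics import classification_report
# Added sketch: per-class precision/recall/F1 for the 3-feature forest
# (0 = male, 1 = female, matching the encoding above)
y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["male", "female"]))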