Gender recognition by voice¶
Credit: AITS Cainvas Community
Photo by Priyank Vyas on Dribbble
Using acoustic features extracted from a voice recording to predict the gender of the speaker.
Recognizing the speaker's gender is useful for automatic salutations, tagging audio recordings, separating sounds belonging to a specific gender for analysis, etc., and can help digital personal assistants produce gender-specific results.
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import random
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
Dataset¶
The dataset consists of 3,168 recorded voice samples, collected from male and female speakers. The voice samples were pre-processed by acoustic analysis using the seewave and tuneR R packages, with an analyzed frequency range of 0-280 Hz (the human vocal range).
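Each attribute (such as meanfreq, the mean frequency in kHz) is a summary statistic of a recording's frequency spectrum. Below is a rough, minimal sketch of how such a value could be computed in Python; the dataset itself was produced with the R packages mentioned above, and 'sample.wav' is a hypothetical file used purely for illustration.
# Sketch only: approximate mean frequency (in kHz) of a recording, restricted to the 0-280 Hz band analyzed in the dataset
from scipy.io import wavfile

def mean_frequency_khz(path, fmin=0, fmax=280):
    rate, data = wavfile.read(path)
    data = data.astype(float)
    if data.ndim > 1:
        data = data.mean(axis=1)                        # mix stereo down to mono
    spectrum = np.abs(np.fft.rfft(data))                # magnitude spectrum
    freqs = np.fft.rfftfreq(len(data), d=1/rate)        # bin frequencies in Hz
    band = (freqs >= fmin) & (freqs <= fmax)            # keep only the analyzed vocal range
    return np.average(freqs[band], weights=spectrum[band]) / 1000   # amplitude-weighted mean, in kHz

# mean_frequency_khz('sample.wav')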
voice = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/voice.csv')
voice
Looking into the spread of labels in the dataset.
voice['label'].value_counts()
It is a perfectly balanced dataset!
Preprocessing¶
Correlation¶
# Let's look at the correlation between the various attributes
corr = voice.corr()
corr
# There are many column pairs with very high correlation, effectively acting as duplicates.
# We can drop one column of each such pair if the correlation is >= 0.95.

final_columns = list(voice.columns)    # maintaining a temporary list to remove columns from

for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i, j] >= 0.95:    # if very high correlation
            if list(voice.columns)[j] in final_columns:    # if not already removed
                final_columns.remove(list(voice.columns)[j])    # remove
voice = voice[final_columns] # selecting only the required columns
voice
The attribute values in the dataset are not all on the same scale.
voice.describe()
Since the validation and test sets are used to tune and to evaluate the model on unseen data, respectively, we fit the scaler on the train set alone and then use it to scale the values of the val and test sets (as done in the Scaling section below).
Defining the input features and output formats¶
# defining the input and output columns to separate the dataset in the later cells.
input_columns = list(voice.columns[:-1])
output_columns = ['male', 'female'] # column names to be used after one-hot encoding
print("Number of input columns: ", len(input_columns))
#print("Input columns: ", ', '.join(input_columns))
print("Number of output columns: ", len(output_columns))
#print("Output columns: ", ', '.join(output_columns))
As the model below uses a softmax activation function in its output layer, we need to one-hot encode the class labels.
# One hot encoding the labels (we can also use sigmoid activation and binary crossentropy instead)
y = pd.get_dummies(voice.label)
print(y)
# Adding the one hot encodings to the dataset
for x in output_columns:
    voice[x] = y[x]
# Viewing the labels
voice
Splitting into train, val and test sets¶
# Splitting into train, val and test set -- 80-10-10 split
# First, an 80-20 split
train_df, val_test_df = train_test_split(voice, test_size = 0.2)
# Then split the 20% into half
val_df, test_df = train_test_split(val_test_df, test_size = 0.5)
print("Number of samples in...")
print("Training set: ", len(train_df))
print("Validation set: ", len(val_df))
print("Testing set: ", len(test_df))
# Splitting into X (input) and y (output)
Xtrain, ytrain = np.array(train_df[input_columns]), np.array(train_df[output_columns])
Xval, yval = np.array(val_df[input_columns]), np.array(val_df[output_columns])
Xtest, ytest = np.array(test_df[input_columns]), np.array(test_df[output_columns])
Scaling the attribute values¶
# Each feature has a different range.
# Using min_max_scaler to scale them to values in the range [0,1].
min_max_scaler = MinMaxScaler()
# Fit on training set alone
Xtrain = min_max_scaler.fit_transform(Xtrain)
# Use it to transform val and test input
Xval = min_max_scaler.transform(Xval)
Xtest = min_max_scaler.transform(Xtest)
# Viewing the scaled training inputs (the voice dataframe itself is unchanged)
pd.DataFrame(Xtrain, columns=input_columns).describe()
Model¶
model = tf.keras.Sequential([
    layers.Dense(16, activation = 'relu', input_shape = Xtrain[0].shape),
    layers.Dense(8, activation = 'relu'),
    layers.Dense(len(output_columns), activation = 'softmax')
])
model.compile(optimizer = tf.keras.optimizers.Adam(0.01), loss = tf.losses.CategoricalCrossentropy(), metrics = ['accuracy'])
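As an aside, the alternative mentioned earlier (a sigmoid output with binary crossentropy) would look roughly like the sketch below; it is not used in this notebook, and binary_model is just an illustrative name.
# Sketch only (not trained here): binary formulation with a single sigmoid output unit.
# The labels would then be a single 0/1 column instead of the one-hot 'male'/'female' pair,
# e.g. ytrain[:, output_columns.index('male')].
binary_model = tf.keras.Sequential([
    layers.Dense(16, activation = 'relu', input_shape = Xtrain[0].shape),
    layers.Dense(8, activation = 'relu'),
    layers.Dense(1, activation = 'sigmoid')
])
binary_model.compile(optimizer = tf.keras.optimizers.Adam(0.01), loss = 'binary_crossentropy', metrics = ['accuracy'])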
callbacks = [EarlyStopping(monitor='val_loss', patience = 5), # stop if val_loss doesn't improve for 5 consecutive epochs
             ModelCheckpoint('gender_recognition_voice.h5', save_best_only=True)] # save the best model so far
model.summary()
history = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs=32, callbacks=callbacks)
model.load_weights('gender_recognition_voice.h5') # Load the weights of the best model
model.evaluate(Xtest, ytest)
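Beyond the overall accuracy, a confusion matrix gives a per-class view of the errors. A minimal sketch (not part of the original workflow), assuming sklearn.metrics is available:
# Optional sketch: confusion matrix over the test set
from sklearn.metrics import confusion_matrix
test_preds = np.argmax(model.predict(Xtest), axis = 1)
print(confusion_matrix(np.argmax(ytest, axis = 1), test_preds))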
Plotting the metrics¶
def plot(history, variable, variable1):
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable1])), history[variable1])
    plt.title(variable)
    plt.legend([variable, variable1])
plot(history.history, "accuracy", "val_accuracy")
plot(history.history, "loss", "val_loss")
Prediction¶
# Pick random test sample
i = random.randint(0, len(test_df)-1)
model_output = model.predict(Xtest[i].reshape(1, -1))[0]
pred = np.argmax(model_output)
# show predicted output
print ("\nModel predicted the gender: ", output_columns[pred])
# actual output
print("Actual gender: ", output_columns[np.argmax(ytest[i])], "with probability", model_output[pred])
deepC¶
Compiling the trained Keras model with the deepC compiler so that it can be run as a standalone executable, as done on a test sample below.
!deepCC gender_recognition_voice.h5
# Pick random test sample
i = random.randint(0, len(test_df)-1)
np.savetxt('sample.data', Xtest[i])
# run exe with input
!gender_recognition_voice_deepC/gender_recognition_voice.exe sample.data
# show predicted output
nn_out = np.loadtxt('dense_2.out')
print ("\nModel predicted the gender: ", output_columns[np.argmax(nn_out)])
# actual output
print("Actual gender: ", output_columns[np.argmax(ytest[i])])