Parkinson's Disease detection¶
Credit: AITS Cainvas Community
Photo by Traci C on Local Guides Connect, Google
Detecting Parkinson's disease in patients using speech signals
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
Dataset¶
Source:¶
The dataset was created by Max Little of the University of Oxford, in collaboration with the National Centre for Voice and Speech, Denver, Colorado, who recorded the speech signals. The original study published the feature extraction methods for general voice disorders.
Dataset information¶
This dataset is composed of a range of biomedical voice measurements from 31 people, 23 of whom have Parkinson's disease (PD). Each column in the table is a particular voice measure, and each row corresponds to one of 195 voice recordings from these individuals. The main aim of the data is to discriminate healthy people from those with PD, according to the "status" column, which is set to 0 for healthy and 1 for PD.
parkinson = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/parkinsons.data')
parkinson
# shuffling the dataset because the rows are ordered by the status column in the source file
parkinson = parkinson.sample(frac=1, random_state=13)
parkinson
# looking into the classes
parkinson['status'].value_counts()
It is an imbalanced dataset, so accuracy alone can be misleading: always predicting the majority class would already score well. We will use the F1 score (alongside the confusion matrix) to judge the performance of our model.
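If we wanted to counteract the imbalance during training itself, Keras' fit() accepts a class_weight argument. A minimal sketch (optional; not used in this notebook, and the names below are illustrative) that derives balanced weights with scikit-learn:
# Optional sketch: per-class weights that could be passed to model.fit(..., class_weight=...)
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(parkinson['status'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=parkinson['status'])
print(dict(zip(classes, weights)))  # the minority (healthy) class gets the larger weight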
Processing the data¶
input_columns = list(parkinson.columns)
input_columns.remove('status')
input_columns.remove('name')
#output_columns = ['status'] # use for sigmoid activated last layer of model
output_columns = ['no', 'yes'] # use for one hot encoded data in the last layer
print("Input columns: ", input_columns)
print("Output columns: ", output_columns)
parkinson.describe()
# The value ranges of the attributes differ widely, so we bring them onto the same scale using MinMaxScaler
scaler = MinMaxScaler()
parkinson[input_columns] = scaler.fit_transform(parkinson[input_columns])
parkinson.describe()
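Note that fitting the scaler on the full table lets the held-out rows influence the scaling parameters. A stricter variant (a sketch only; train_part and the other names are illustrative) would fit on the training split and merely transform the rest:
# Sketch: leakage-free scaling, fitting min/max on the training split only
train_part, rest_part = train_test_split(parkinson, train_size=0.8, random_state=13)
strict_scaler = MinMaxScaler().fit(train_part[input_columns])
train_scaled = strict_scaler.transform(train_part[input_columns])
rest_scaled = strict_scaler.transform(rest_part[input_columns])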
# one hot encoding the output columns
parkinson[['no', 'yes']] = pd.get_dummies(parkinson.status)
parkinson
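The same encoding can also be obtained with tf.keras.utils.to_categorical; a toy example for reference:
# Reference sketch: to_categorical maps integer labels to one-hot rows
print(tf.keras.utils.to_categorical(np.array([0, 1, 1, 0]), num_classes=2))
# -> [[1. 0.], [0. 1.], [0. 1.], [1. 0.]]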
# Using an 80-10-10 train-val-test split
train_df, val_df = train_test_split(parkinson, train_size=0.8) # 80-20 split
val_df, test_df = train_test_split(val_df, train_size=0.5) # splitting the 20% into two halves
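Because the classes are imbalanced, a stratified variant of this split would keep the 0/1 ratio consistent across the three subsets. A sketch (not used above; tr_df and the other names are illustrative):
# Sketch: stratify on 'status' so each subset preserves the class ratio
tr_df, rest_df = train_test_split(parkinson, train_size=0.8, stratify=parkinson['status'])
va_df, te_df = train_test_split(rest_df, train_size=0.5, stratify=rest_df['status'])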
print("Train dataset")
print(len(train_df))
print(train_df['status'].value_counts())
print("Val dataset")
print(len(val_df))
print(val_df['status'].value_counts())
print("Test dataset")
print(len(test_df))
print(test_df['status'].value_counts())
# splitting into input and output
xtrain, ytrain = np.array(train_df[input_columns]).astype('float32'), np.array(train_df[output_columns])
xval, yval = np.array(val_df[input_columns]).astype('float32'), np.array(val_df[output_columns])
xtest, ytest = np.array(test_df[input_columns]).astype('float32'), np.array(test_df[output_columns])
Model¶
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(256, activation="relu", input_shape = xtrain[0].shape))
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dense(2, activation="softmax"))
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
history = model.fit(xtrain, ytrain, epochs = 32, batch_size = 8, validation_data=(xval, yval))
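With under 200 samples the network can overfit quickly. A variant of the training call (a sketch, not the run above) could stop early on validation loss and keep the best weights:
# Sketch: early stopping that restores the best weights seen on the validation set
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# model.fit(xtrain, ytrain, epochs=64, batch_size=8, validation_data=(xval, yval), callbacks=[early_stop])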
model.evaluate(xtest, ytest)
model.summary()
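As the commented-out output_columns = ['status'] option above suggests, a single sigmoid output trained with binary cross-entropy is an equivalent formulation. A sketch under that assumption (binary_model is illustrative and is not trained in this notebook):
# Sketch: binary head matching the output_columns = ['status'] option
binary_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=xtrain[0].shape),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
binary_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# targets would then be train_df['status'].values instead of the one-hot columns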
# predict_classes was removed in newer TF releases; take the argmax of the softmax outputs instead
ypred = np.argmax(model.predict(xtest), axis=1)
confusion_matrix(np.argmax(ytest, axis = 1), ypred)
From the confusion matrix, it is evident that the high accuracy is not merely the result of defaulting to the majority class. Our model is well trained.
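Since accuracy alone is unreliable on imbalanced data (as noted earlier), the per-class precision, recall and F1 score can be read off the same predictions:
# Per-class precision/recall/F1 on the test set, complementing the confusion matrix
from sklearn.metrics import classification_report
print(classification_report(np.argmax(ytest, axis=1), ypred, target_names=output_columns))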
Plotting the metrics¶
def plot(history, variable, variable1):
    # plot a training metric and its validation counterpart on one figure
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable1])), history[variable1])
    plt.legend([variable, variable1])
    plt.title(variable)
    plt.show()  # render each call as a separate figure
plot(history.history, "accuracy", "val_accuracy")
plot(history.history, "loss", "val_loss")
Prediction¶
# pick a random sample from the test data
x = random.randint(0, len(xtest)- 1)
pred = model.predict(xtest[x].reshape(1, -1))
diagnosis = np.argmax(pred[0])
print("Actual diagnosis: ", output_columns[np.argmax(ytest[x])])
print("Model diagnosis: ", output_columns[diagnosis], " with probability ", pred[0][diagnosis])
deepC¶
model.save('parkinsons.h5')
!deepCC parkinsons.h5
# pick a random sample from the test data
x = random.randint(0, len(xtest)- 1)
np.savetxt('sample.data', xtest[x].reshape(1, -1))
# run exe with input
!parkinsons_deepC/parkinsons.exe sample.data
pred = np.loadtxt('dense_3.out')
diagnosis = np.argmax(pred)
print("Actual diagnosis: ", output_columns[np.argmax(ytest[x])])
print("Model diagnosis: ", output_columns[diagnosis], " with probability ", pred[diagnosis])