Training a deep learning model to prescribe a drug based on the patient's data.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import random
import matplotlib.pyplot as plt
Dataset¶
On Kaggle by Pratham Tripathi.
The dataset is a CSV file containing the patient features that affect drug prescription (age, sex, BP level, cholesterol, and sodium-to-potassium ratio) along with the drug prescribed in each case.
# Load the drug-prescription table straight from the hosted CSV file.
DATASET_URL = 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/drug200.csv'
df = pd.read_csv(DATASET_URL)
df
# Number of rows per drug label (shows how unbalanced the classes are).
df.Drug.value_counts()
This is an unbalanced dataset.
Preprocessing¶
Balancing the dataset¶
In order to balance the dataset, there are two options,
- upsampling - resample each class (with replacement) so that its count equals that of the most frequent class label (here, 91).
- downsampling - pick n samples from each class label, where n is the number of samples in the least frequent class (here, 16).
Here, we will be upsampling.
# Upsample every drug class (sampling with replacement) to the size of the
# largest class so that all labels are equally represented.
# NOTE(review): the balanced frame is split into train/val/test afterwards,
# so copies of the same original row can land in several splits and inflate
# the reported test accuracy. Consider splitting first and upsampling only
# the training part.
categories = np.unique(df.Drug.to_list())

# Target size per class: derived from the data instead of a hard-coded count.
samples_per_class = int(df['Drug'].value_counts().max())

# DataFrame.append was removed in pandas 2.0; collect the per-class samples
# and concatenate them once.
df_balanced = pd.concat(
    [
        df[df['Drug'] == category].sample(samples_per_class, replace=True)
        for category in categories
    ]
)

df_balanced['Drug'].value_counts()
Categorical variables¶
The 'sex' column does not define a range and thus is one-hot encoded while changing from a categorical variable to a numerical attribute
# One-hot encode 'Sex' on the feature columns (everything except the last
# column, which is assumed to be the 'Drug' target). drop_first=True keeps a
# single 0/1 indicator column instead of two redundant ones.
# dtype=int pins the indicator to integers: pandas >= 2.0 defaults to bool,
# which would turn the mixed feature matrix into an object array downstream.
feature_columns = df_balanced.columns[:-1]
dfx = pd.get_dummies(
    df_balanced[feature_columns],
    columns=['Sex'],
    drop_first=True,
    dtype=int,
)
dfx

# Inspect the remaining categorical columns before encoding them.
print("Values in BP column:", np.unique(dfx['BP']))
print("Values in Cholesterol column:", np.unique(dfx['Cholesterol']))
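The values in the Cholesterol and BP columns represent ordered levels, as seen in the output above, so each is encoded as a single integer column rather than being one-hot encoded. Note that LabelEncoder assigns the integer codes in alphabetical order of the class names (for BP: HIGH=0, LOW=1, NORMAL=2), not in order of severity.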
# Replace the BP and Cholesterol strings with integer codes.
# NOTE: LabelEncoder sorts its classes, so the codes follow alphabetical
# order of the names rather than LOW < NORMAL < HIGH.
# The fitted encoders are kept so codes can be mapped back to names later.
le_bp = LabelEncoder().fit(['LOW', 'NORMAL', 'HIGH'])
bp_codes = le_bp.transform(dfx['BP'])
dfx['BP'] = bp_codes
print("BP classes:", le_bp.classes_)

le_ch = LabelEncoder().fit(['NORMAL', 'HIGH'])
cholesterol_codes = le_ch.transform(dfx['Cholesterol'])
dfx['Cholesterol'] = cholesterol_codes
print("Cholesterol classes:", le_ch.classes_)

print(dfx)
Since this is a classification problem, the target column (the drug name) is one-hot encoded so that it matches the model's softmax output.
# One-hot encode the target: one 0/1 column per drug label.
# dtype=int keeps the targets numeric on every pandas version
# (pandas >= 2.0 would otherwise produce bool columns).
df_cat = pd.get_dummies(df_balanced['Drug'], dtype=int)
df_cat
# Remember which columns are model inputs and which are one-hot targets;
# these lists are used later to slice the combined frame into X and y.
input_columns = list(dfx.columns)
output_columns = list(df_cat.columns)

print("Number of input columns: ", len(input_columns))
print("Number of output columns: ", len(output_columns))

# Attach every target column to the feature frame so that features and
# labels stay together when the rows are split.
for label in output_columns:
    dfx[label] = df_cat[label]

# The standalone target frame is no longer needed.
del df_cat

dfx
Train test split¶
# 80/10/10 train/validation/test split, done in two steps with a fixed seed.
SPLIT_SEED = 13

# Step 1: hold out 20% of the rows.
train_df, val_test_df = train_test_split(dfx, test_size=0.2, random_state=SPLIT_SEED)
# Step 2: cut the held-out rows in half -> validation and test.
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=SPLIT_SEED)

print("Number of samples in...")
for caption, subset in (
    ("Training set: ", train_df),
    ("Validation set: ", val_df),
    ("Testing set: ", test_df),
):
    print(caption, len(subset))
def _split_xy(frame):
    """Return (inputs, targets) of *frame* as two NumPy arrays."""
    return np.array(frame[input_columns]), np.array(frame[output_columns])


# Separate each split into X (input) and y (one-hot output).
Xtrain, ytrain = _split_xy(train_df)
Xval, yval = _split_xy(val_df)
Xtest, ytest = _split_xy(test_df)
Scaling the values¶
# The features have different ranges; rescale each one to [0, 1]
# based on the minimum and maximum seen in the training data.
min_max_scaler = MinMaxScaler()

# Learn the per-feature min/max from the training set alone ...
min_max_scaler.fit(Xtrain)

# ... then apply that same transformation to all three splits.
Xtrain, Xval, Xtest = (
    min_max_scaler.transform(split) for split in (Xtrain, Xval, Xtest)
)
Model¶
# Fully connected classifier: four ReLU hidden layers of decreasing width,
# followed by a softmax layer with one unit per drug label.
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=Xtrain[0].shape))
for hidden_units in (512, 256, 64):
    model.add(Dense(hidden_units, activation='relu'))
model.add(Dense(len(output_columns), activation='softmax'))

# Stop once the validation loss has not improved for 8 epochs and
# roll back to the weights of the best epoch.
cb = [EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)]

model.summary()

model.compile(
    optimizer=Adam(0.01),
    loss=CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = model.fit(
    Xtrain,
    ytrain,
    validation_data=(Xval, yval),
    epochs=64,
    callbacks=cb,
)
# Loss and accuracy on the held-out test set.
model.evaluate(Xtest, ytest)

# Row-normalised confusion matrix: entry (i, j) is the fraction of samples
# of true class i that were predicted as class j.
true_ids = np.argmax(ytest, axis=1)
pred_ids = np.argmax(model.predict(Xtest), axis=1)

# Pass the full label list so the matrix always has one row/column per drug,
# even if some class happens to be missing from the test split.
class_ids = np.arange(len(output_columns))
cm = confusion_matrix(true_ids, pred_ids, labels=class_ids)

# A class without test samples has an all-zero row; leave it at 0 instead of
# producing NaN from 0/0.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Annotate each cell; rows (i) are on the y axis, columns (j) on the x axis.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color="black")

plt.imshow(cm, cmap=plt.cm.Blues)
It is important to keep the accuracy extremely high (100%) as chances cannot be taken with a patient's medication.
Plotting the metrics¶
def plot(history, variable, variable1):
    """Plot two per-epoch metric curves from a training history on one figure.

    Args:
        history: Mapping from metric name to a sequence with one value per
            epoch (e.g. the ``history`` attribute of the object returned by
            ``model.fit``).
        variable: Name of the first metric; also used as the plot title.
        variable1: Name of the second metric (typically the validation
            counterpart of ``variable``).
    """
    # Start a fresh figure so consecutive calls do not draw on the same axes.
    plt.figure()
    for metric_name in (variable, variable1):
        # With a single argument, plot() uses 0..len-1 (the epoch index) as x.
        plt.plot(history[metric_name])
    plt.title(variable)
    plt.xlabel("epoch")
    plt.legend([variable, variable1])


plot(history.history, "accuracy", "val_accuracy")
plot(history.history, "loss", "val_loss")
Prediction¶
# Lookup indexed by the sex indicator column left after one-hot encoding with
# drop_first=True: the alphabetically first category is dropped, so the
# remaining column is 1 for male and 0 for female (assuming raw values
# 'F'/'M'). Hence index 0 -> 'F', index 1 -> 'M'.
gender = ['F', 'M']


def print_sample(x):
    """Print the (unscaled) input features of the x-th row of ``test_df``.

    Columns are read by position: 0 = age, 1 = BP code, 2 = cholesterol code,
    3 = Na-to-K ratio, 4 = sex indicator. The BP and cholesterol codes are
    mapped back to their names via the fitted label encoders.

    Args:
        x: Positional row index into ``test_df``.
    """
    print("\nSample:")
    # Only the requested row is converted, not the whole frame.
    sample = test_df.iloc[x].to_numpy()
    print("Age :", sample[0])
    print("Sex :", gender[int(sample[4])])
    print("Na to K ratio :", sample[3])
    print("BP :", le_bp.classes_[int(sample[1])])
    print("Cholesterol :", le_ch.classes_[int(sample[2])])
    print()
# Choose one test row at random and show its input features.
x = random.randrange(len(Xtest))
print_sample(x)

# The model expects a batch, so add a leading axis:
# (n_features,) --> (1, n_features).
output = model.predict(Xtest[x][np.newaxis, :])
class_probs = output[0]

# The predicted class is the one with the highest softmax probability.
pred = np.argmax(class_probs)
print("Predicted: ", output_columns[pred])  # map the index to its drug label

# Ground truth: position of the 1 in the one-hot target row.
output_true = np.array(ytest)[x]
print("True: ", output_columns[np.argmax(output_true)])

print("Probability: ", class_probs[pred])
deepC¶
# Save the trained Keras model to 'drug.h5' and compile that file with the
# deepCC command-line tool (IPython shell escape; only works in a notebook).
model.save('drug.h5')
!deepCC drug.h5
# Pick a random index into the test set.
x = random.randint(0, len(Xtest) - 1)
np.savetxt('sample.data', Xtest[x]) # write the x-th (already scaled) test sample to a text file
# Run the compiled executable on that sample file.
# NOTE(review): the executable path is presumably produced by the deepCC step above - confirm.
!drug_deepC/drug.exe sample.data
# Load the executable's output.
# NOTE(review): assumes the run writes 'deepSea_result_1.out' with one score per class - confirm.
nn_out = np.loadtxt('deepSea_result_1.out')
print_sample(x)
pred = np.argmax(nn_out) # index of the largest output value
print("Predicted: ", output_columns[pred]) # map that index to its drug label
output_true = np.array(ytest)[x]
print("True: ", output_columns[np.argmax(output_true)])
print("Probability: ", nn_out[pred])