Stroke Detection using Deep Learning

# Import all the necessary libraries

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

Download and Unzip the Data

!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/kkharbanda90/archive.zip'

!unzip -qo archive.zip 
!rm archive.zip

--2021-11-17 15:26:21--  https://cainvas-static.s3.amazonaws.com/media/user_data/kkharbanda90/archive.zip
Resolving cainvas-static.s3.amazonaws.com (cainvas-static.s3.amazonaws.com)... 52.219.156.67
Connecting to cainvas-static.s3.amazonaws.com (cainvas-static.s3.amazonaws.com)|52.219.156.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69007 (67K) [application/x-zip-compressed]
Saving to: ‘archive.zip’

archive.zip         100%[===================>]  67.39K  --.-KB/s    in 0.001s  

2021-11-17 15:26:21 (44.8 MB/s) - ‘archive.zip’ saved [69007/69007]

# Read the comma-separated stroke dataset into a DataFrame and preview it

data = pd.read_csv(
    'healthcare-dataset-stroke-data.csv',
    sep=",",
)
data.head(n=3)

Next, we will remove the ID column, as it will not be used for training the model. After getting rid of this irrelevant column, we need to check for NA values.

# The "id" column is not used for training, so drop it
data = data.drop(columns=["id"])
# Count the missing values in each remaining column
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Filling the NA values in the Body Mass Index with the mean value.

# Replace missing BMI readings with the mean of the observed BMI values.
# NOTE(review): the mean is taken over the whole dataset, before the
# train/test split further down, so held-out rows contribute to it.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

Visualising The Data

# Bar chart of the number of records in each stroke class
sns.countplot(x='stroke', data=data)
plt.title("Stroke Occurance")

Text(0.5, 1.0, 'Stroke Occurance')

# Age against smoking status, with one line per stroke class
sns.lineplot(
    x='smoking_status',
    y='age',
    hue='stroke',
    data=data,
)
plt.title("Effect of Smoking and Age on Stroke")

Text(0.5, 1.0, 'Effect of Smoking and Age on Stroke')

# Draw a grid of pairwise plots to inspect how the columns relate to each other
sns.pairplot(data=data, height=2)
plt.show()

# Compare stroke counts between people who have and have not been married.
# The title now describes this plot; it was previously copied from the
# smoking/age chart and mislabelled the figure.
sns.countplot(data=data, x='ever_married', hue='stroke')
plt.title("Effect of Marital Status on Stroke")

Text(0.5, 1.0, 'Effect of Smoking and Age on Stroke')

# Box plot of BMI per smoking category, split by stroke class, on a wide canvas
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(
    x='smoking_status',
    y='bmi',
    hue='stroke',
    data=data,
    fliersize=3,
)
plt.title("Effect of Smoking and BMI on Stroke", fontsize=25)
plt.xlabel("Smokers or Non-Smokers", fontsize=16)
plt.ylabel("Body Mass Index", fontsize=16)

Text(0, 0.5, 'Body Mass Index')

# Plotting a heatmap/correlation plot to see how the numeric columns are related.
# The categorical columns are still text at this point, so only numeric columns
# are correlated; calling corr() on the text columns raises on pandas >= 2.0.
plt.figure(figsize=(15, 12))
numeric_corr = data.select_dtypes(include='number').corr()
# `center` expects a number: 0 puts the colormap midpoint at "no correlation"
sns.heatmap(numeric_corr, annot=True, linewidths=2, center=0)
plt.show()

# Confirm that every column now reports zero missing values
data.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

# Preview the first rows now that the missing BMI values have been filled
data.head()

Using Label Encoder to transform different categories in the data into numerical format in order to feed the data to the DNN Model

# Replace every text category with an integer code so the columns can be fed
# to the network. The same encoder object is re-fitted for each column in turn.
le = LabelEncoder()
for column in ('gender', 'ever_married', 'work_type',
               'Residence_type', 'smoking_status'):
    data[column] = le.fit_transform(data[column])

# Preview the first rows after the categorical columns have been label-encoded
data.head()

print(data.shape)
# Pick the features and the target by column name instead of by position, so
# the selection stays correct if columns are added, removed or reordered.
X = data.drop(columns=['stroke']).values
y = data['stroke'].values

(5110, 11)

# Hold out 40% of the rows; stratifying on y keeps the class ratio of the
# target the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
    shuffle=True,
    stratify=y,
)

# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted scaler to both the training and the held-out split

sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# One-hot encode the integer class labels of both splits

from tensorflow.keras.utils import to_categorical

Y_train, Y_test = to_categorical(Y_train), to_categorical(Y_test)
print("Y = ", Y_train.shape)
print("X = ", X_train.shape)

Y =  (3066, 2)
X =  (3066, 10)

# Stop once val_loss has not improved for 5 epochs, and roll the model back to
# the weights of the best epoch so that the model saved afterwards is the best
# one seen rather than the one from the final epoch.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Architecture of the network: four ReLU hidden layers (100, 100, 50, 40 units)
# with 30% dropout after the first, third and fourth, and a 2-way softmax output

model = Sequential([
    Dense(100, activation="relu", input_dim=10),
    Dropout(0.3),
    Dense(100, activation="relu"),
    Dense(50, activation="relu"),
    Dropout(0.3),
    Dense(40, activation="relu"),
    Dropout(0.3),
    Dense(2, activation="softmax"),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 100)               1100      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 40)                2040      
_________________________________________________________________
dropout_2 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 82        
=================================================================
Total params: 18,372
Trainable params: 18,372
Non-trainable params: 0
_________________________________________________________________

# Compiling the model.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and is rejected by newer Keras releases.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Run the model for a batch size of 35 for up to 100 epochs (early stopping may
# end training sooner).
# validation_steps is intentionally not set: with validation_steps=10 each
# validation pass stopped after 10 batches, so val_loss/val_accuracy (and the
# early-stopping decision) were computed on only part of the held-out data.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    batch_size=35,
    epochs=100,
    callbacks=[es],
)

Epoch 1/100
88/88 [==============================] - 0s 4ms/step - loss: 0.5105 - accuracy: 0.8581 - val_loss: 0.3523 - val_accuracy: 0.9514
Epoch 2/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2928 - accuracy: 0.9511 - val_loss: 0.2281 - val_accuracy: 0.9514
Epoch 3/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2294 - accuracy: 0.9514 - val_loss: 0.2027 - val_accuracy: 0.9514
Epoch 4/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2140 - accuracy: 0.9514 - val_loss: 0.1886 - val_accuracy: 0.9514
Epoch 5/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2073 - accuracy: 0.9514 - val_loss: 0.1786 - val_accuracy: 0.9514
Epoch 6/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2045 - accuracy: 0.9514 - val_loss: 0.1704 - val_accuracy: 0.9514
Epoch 7/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1958 - accuracy: 0.9514 - val_loss: 0.1650 - val_accuracy: 0.9514
Epoch 8/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1939 - accuracy: 0.9514 - val_loss: 0.1621 - val_accuracy: 0.9514
Epoch 9/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1800 - accuracy: 0.9514 - val_loss: 0.1596 - val_accuracy: 0.9514
Epoch 10/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1865 - accuracy: 0.9514 - val_loss: 0.1581 - val_accuracy: 0.9514
Epoch 11/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1835 - accuracy: 0.9514 - val_loss: 0.1579 - val_accuracy: 0.9514
Epoch 12/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1806 - accuracy: 0.9514 - val_loss: 0.1568 - val_accuracy: 0.9514
Epoch 13/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1774 - accuracy: 0.9514 - val_loss: 0.1570 - val_accuracy: 0.9514
Epoch 14/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1871 - accuracy: 0.9514 - val_loss: 0.1576 - val_accuracy: 0.9514
Epoch 15/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1761 - accuracy: 0.9514 - val_loss: 0.1564 - val_accuracy: 0.9514
Epoch 16/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1717 - accuracy: 0.9514 - val_loss: 0.1561 - val_accuracy: 0.9514
Epoch 17/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1744 - accuracy: 0.9514 - val_loss: 0.1568 - val_accuracy: 0.9514
Epoch 18/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1718 - accuracy: 0.9514 - val_loss: 0.1563 - val_accuracy: 0.9514
Epoch 19/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1708 - accuracy: 0.9514 - val_loss: 0.1565 - val_accuracy: 0.9514
Epoch 20/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1718 - accuracy: 0.9514 - val_loss: 0.1578 - val_accuracy: 0.9514
Epoch 21/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1713 - accuracy: 0.9514 - val_loss: 0.1565 - val_accuracy: 0.9514

# Function to plot "accuracy vs epoch" graphs and "loss vs epoch" graphs for training and validation data
def plot_metrics(model_name, metric = 'accuracy'):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        model_name: object exposing a ``history`` mapping with the per-epoch
            values (in this notebook, the object returned by ``model.fit``).
        metric: ``'loss'`` draws the loss curves; any other value draws the
            accuracy curves.
    """
    # Anything other than 'loss' falls back to the accuracy curves
    if metric == 'loss':
        key, title = 'loss', "Loss Values"
    else:
        key, title = 'accuracy', "Accuracy Values"

    plt.title(title)
    plt.plot(model_name.history[key], label='train')
    plt.plot(model_name.history['val_' + key], label='test')
    plt.legend()
    plt.show()

# Draw the accuracy curves first, then the loss curves
for curve in ('accuracy', 'loss'):
    plot_metrics(history, curve)

# Saving our trained model
from tensorflow.keras.models import save_model
# Always overwrite the file: skipping the save when 'best_model.h5' already
# exists would leave a stale model from an earlier run on disk, and that stale
# file is what the later compilation step would read.
model.save('best_model.h5')

# Plotting a confusion matrix for checking the performance of our model
Y_pred = np.argmax(model.predict(X_test), axis=1)
Y_true = Y_test.argmax(axis=1)
# labels=[0, 1] keeps the matrix 2x2 even when one class is never predicted
cnf = confusion_matrix(Y_true, Y_pred, labels=[0, 1])

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns, so the heatmap's y-axis is "true" and its x-axis "predicted"
df_cnf = pd.DataFrame(cnf, index=range(2), columns=range(2))
sns.set(font_scale=2)
# fmt='d' prints the cell counts as plain integers
sns.heatmap(df_cnf, annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Values")
plt.ylabel("True Values")
plt.show()

!deepCC best_model.h5

[INFO]
Reading [keras model] 'best_model.h5'
[SUCCESS]
Saved 'best_model_deepC/best_model.onnx'
[INFO]
Reading [onnx model] 'best_model_deepC/best_model.onnx'
[INFO]
Model info:
  ir_vesion : 4
  doc       : 
[WARNING]
[ONNX]: terminal (input/output) dense_input's shape is less than 1. Changing it to 1.
[WARNING]
[ONNX]: terminal (input/output) dense_4's shape is less than 1. Changing it to 1.
[INFO]
Running DNNC graph sanity check ...
[SUCCESS]
Passed sanity check.
[INFO]
Writing C++ file 'best_model_deepC/best_model.cpp'
[INFO]
deepSea model files are ready in 'best_model_deepC/' 
[RUNNING COMMAND]
g++ -std=c++11 -O3 -fno-rtti -fno-exceptions -I. -I/opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/include -isystem /opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/packages/eigen-eigen-323c052e1731 "best_model_deepC/best_model.cpp" -D_AITS_MAIN -o "best_model_deepC/best_model.exe"
[RUNNING COMMAND]
size "best_model_deepC/best_model.exe"
   text	   data	    bss	    dec	    hex	filename
 200085	   2984	    760	 203829	  31c35	best_model_deepC/best_model.exe
[SUCCESS]
Saved model as executable "best_model_deepC/best_model.exe"

	id	gender	age	heart_disease	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status	stroke
0	9046	Male	67.0	1	Yes	Private	Urban	228.69	36.6	formerly smoked	1
1	51676	Female	61.0	0	Yes	Self-employed	Rural	202.21	NaN	never smoked	1
2	31112	Male	80.0	1	Yes	Private	Rural	105.92	32.5	never smoked	1

	gender	age	hypertension	heart_disease	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status	stroke
0	Male	67.0	0	1	Yes	Private	Urban	228.69	36.600000	formerly smoked	1
1	Female	61.0	0	0	Yes	Self-employed	Rural	202.21	28.893237	never smoked	1
2	Male	80.0	0	1	Yes	Private	Rural	105.92	32.500000	never smoked	1
3	Female	49.0	0	0	Yes	Private	Urban	171.23	34.400000	smokes	1
4	Female	79.0	1	0	Yes	Self-employed	Rural	174.12	24.000000	never smoked	1

	gender	age	hypertension	heart_disease	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status	stroke
0	1	67.0	0	1	1	2	1	228.69	36.600000	1	1
1	0	61.0	0	0	1	3	0	202.21	28.893237	2	1
2	1	80.0	0	1	1	2	0	105.92	32.500000	2	1
3	0	49.0	0	0	1	2	1	171.23	34.400000	3	1
4	0	79.0	1	0	1	3	0	174.12	24.000000	2	1