Cainvas

Stroke Detection using Deep Learning

In [4]:
# Import all the necessary libraries

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

Unzip the Data

In [5]:
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/kkharbanda90/archive.zip'

!unzip -qo archive.zip 
!rm archive.zip
--2021-11-17 15:26:21--  https://cainvas-static.s3.amazonaws.com/media/user_data/kkharbanda90/archive.zip
Resolving cainvas-static.s3.amazonaws.com (cainvas-static.s3.amazonaws.com)... 52.219.156.67
Connecting to cainvas-static.s3.amazonaws.com (cainvas-static.s3.amazonaws.com)|52.219.156.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69007 (67K) [application/x-zip-compressed]
Saving to: ‘archive.zip’

archive.zip         100%[===================>]  67.39K  --.-KB/s    in 0.001s  

2021-11-17 15:26:21 (44.8 MB/s) - ‘archive.zip’ saved [69007/69007]

In [6]:
#Loading the data file using pandas library

data = pd.read_csv('healthcare-dataset-stroke-data.csv', sep = ",")
data.head(3)
Out[6]:
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1

Next, we will remove the ID column as we will not use that for training the data. After getting rid of irrelevant data, we need to check for NA values.

In [7]:
data=data.drop(["id"], axis=1)
data.isna().sum()
Out[7]:
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Filling the NA values in the Body Mass Index with the mean value.

In [8]:
data['bmi'] = data['bmi'].fillna(np.mean(data['bmi']))

Visualising The Data

In [9]:
sns.countplot(data = data, x = 'stroke')
plt.title("Stroke Occurance")
Out[9]:
Text(0.5, 1.0, 'Stroke Occurance')
In [10]:
sns.lineplot(data = data, x = 'smoking_status', y = 'age', hue = 'stroke')
plt.title("Effect of Smoking and Age on Stroke")
Out[10]:
Text(0.5, 1.0, 'Effect of Smoking and Age on Stroke')
In [11]:
# Visualising the relationship between different columns of the data
sns.pairplot(data, height = 2)
plt.show()
In [12]:
# Counting the occurance of stroke in the data
sns.countplot(data = data, x = 'ever_married',  hue = 'stroke')
plt.title("Effect of Smoking and Age on Stroke")
Out[12]:
Text(0.5, 1.0, 'Effect of Smoking and Age on Stroke')
In [13]:
fig, ax = plt.subplots(figsize = (20,10))
sns.boxplot(data = data, x = 'smoking_status', y = 'bmi', hue = 'stroke', fliersize = 3)
plt.title("Effect of Smoking and BMI on Stroke", fontdict = {'size' : 25})
plt.xlabel("Smokers or Non-Smokers", fontdict = {'size' : 16})
plt.ylabel("Body Mass Index", fontdict = {'size' : 16})
Out[13]:
Text(0, 0.5, 'Body Mass Index')
In [14]:
# Plotting a heatmap/correlation plot to see how different values are related to each other
plt.figure(figsize=(15,12))
sns.heatmap(data.corr(),annot=True,linewidths=2, center = True)
plt.show()
In [15]:
# Making sure that no NA values are left in the data
data.isna().sum()
Out[15]:
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
In [16]:
data.head()
Out[16]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 Male 67.0 0 1 Yes Private Urban 228.69 36.600000 formerly smoked 1
1 Female 61.0 0 0 Yes Self-employed Rural 202.21 28.893237 never smoked 1
2 Male 80.0 0 1 Yes Private Rural 105.92 32.500000 never smoked 1
3 Female 49.0 0 0 Yes Private Urban 171.23 34.400000 smokes 1
4 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.000000 never smoked 1

Using Label Encoder to transform different categories in the data into numerical format in order to feed the data to the DNN Model

In [17]:
le=LabelEncoder()
data.gender=le.fit_transform(data.gender)
data.ever_married=le.fit_transform(data.ever_married)
data.work_type=le.fit_transform(data.work_type)
data.Residence_type=le.fit_transform(data.Residence_type)
data.smoking_status=le.fit_transform(data.smoking_status)
In [18]:
data.head()
Out[18]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 1 67.0 0 1 1 2 1 228.69 36.600000 1 1
1 0 61.0 0 0 1 3 0 202.21 28.893237 2 1
2 1 80.0 0 1 1 2 0 105.92 32.500000 2 1
3 0 49.0 0 0 1 2 1 171.23 34.400000 3 1
4 0 79.0 1 0 1 3 0 174.12 24.000000 2 1
In [19]:
print(data.shape)
X = data.iloc[:,:10].values
y = data.iloc[:,-1].values
(5110, 11)
In [ ]:
 
In [20]:
# Splitting our dataset into train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, y,test_size = 0.4,random_state = 0, stratify = y, shuffle = True)
In [21]:
#Feature Scaling

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
In [22]:
# # convert the data to categorical labels

from tensorflow.keras.utils import to_categorical
Y_train = to_categorical(Y_train, num_classes=None)
Y_test = to_categorical(Y_test, num_classes=None)
print ("Y = ",Y_train.shape)
print ("X = ",X_train.shape)
Y =  (3066, 2)
X =  (3066, 10)
In [23]:
es = EarlyStopping(monitor='val_loss', patience=5)
In [24]:
# Defining the architecture of our deep learning model

model = Sequential()

model.add(Dense(100, activation = "relu", input_dim = 10))
model.add(Dropout(0.3))
model.add(Dense(100, activation = "relu"))
model.add(Dense(50, activation = "relu"))
model.add(Dropout(0.3))
model.add(Dense(40, activation = "relu"))

model.add(Dropout(0.3))
model.add(Dense(2, activation = "softmax"))

model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 100)               1100      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 40)                2040      
_________________________________________________________________
dropout_2 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 82        
=================================================================
Total params: 18,372
Trainable params: 18,372
Non-trainable params: 0
_________________________________________________________________
In [25]:
# Compiling the model
model.compile(optimizer = Adam(lr = 0.0001), loss = 'categorical_crossentropy', metrics = ['accuracy'])
In [26]:
# Run the model for a batch size of 35 for 100 epochs
history = model.fit(X_train, 
                    Y_train, 
                    validation_data = (X_test, Y_test),
                    batch_size = 35,
                    epochs = 100,
                    validation_steps = 10,
                    callbacks = [es]
                   )
Epoch 1/100
88/88 [==============================] - 0s 4ms/step - loss: 0.5105 - accuracy: 0.8581 - val_loss: 0.3523 - val_accuracy: 0.9514
Epoch 2/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2928 - accuracy: 0.9511 - val_loss: 0.2281 - val_accuracy: 0.9514
Epoch 3/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2294 - accuracy: 0.9514 - val_loss: 0.2027 - val_accuracy: 0.9514
Epoch 4/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2140 - accuracy: 0.9514 - val_loss: 0.1886 - val_accuracy: 0.9514
Epoch 5/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2073 - accuracy: 0.9514 - val_loss: 0.1786 - val_accuracy: 0.9514
Epoch 6/100
88/88 [==============================] - 0s 2ms/step - loss: 0.2045 - accuracy: 0.9514 - val_loss: 0.1704 - val_accuracy: 0.9514
Epoch 7/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1958 - accuracy: 0.9514 - val_loss: 0.1650 - val_accuracy: 0.9514
Epoch 8/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1939 - accuracy: 0.9514 - val_loss: 0.1621 - val_accuracy: 0.9514
Epoch 9/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1800 - accuracy: 0.9514 - val_loss: 0.1596 - val_accuracy: 0.9514
Epoch 10/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1865 - accuracy: 0.9514 - val_loss: 0.1581 - val_accuracy: 0.9514
Epoch 11/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1835 - accuracy: 0.9514 - val_loss: 0.1579 - val_accuracy: 0.9514
Epoch 12/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1806 - accuracy: 0.9514 - val_loss: 0.1568 - val_accuracy: 0.9514
Epoch 13/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1774 - accuracy: 0.9514 - val_loss: 0.1570 - val_accuracy: 0.9514
Epoch 14/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1871 - accuracy: 0.9514 - val_loss: 0.1576 - val_accuracy: 0.9514
Epoch 15/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1761 - accuracy: 0.9514 - val_loss: 0.1564 - val_accuracy: 0.9514
Epoch 16/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1717 - accuracy: 0.9514 - val_loss: 0.1561 - val_accuracy: 0.9514
Epoch 17/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1744 - accuracy: 0.9514 - val_loss: 0.1568 - val_accuracy: 0.9514
Epoch 18/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1718 - accuracy: 0.9514 - val_loss: 0.1563 - val_accuracy: 0.9514
Epoch 19/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1708 - accuracy: 0.9514 - val_loss: 0.1565 - val_accuracy: 0.9514
Epoch 20/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1718 - accuracy: 0.9514 - val_loss: 0.1578 - val_accuracy: 0.9514
Epoch 21/100
88/88 [==============================] - 0s 2ms/step - loss: 0.1713 - accuracy: 0.9514 - val_loss: 0.1565 - val_accuracy: 0.9514
In [27]:
# Function to plot "accuracy vs epoch" graphs and "loss vs epoch" graphs for training and validation data
def plot_metrics(model_name, metric = 'accuracy'):
    if metric == 'loss':
        plt.title("Loss Values")
        plt.plot(model_name.history['loss'], label = 'train')
        plt.plot(model_name.history['val_loss'], label = 'test')
        plt.legend()
        plt.show()
    else:
        plt.title("Accuracy Values")
        plt.plot(model_name.history['accuracy'], label='train') 
        plt.plot(model_name.history['val_accuracy'], label='test') 
        plt.legend()
        plt.show()
In [28]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')
In [29]:
# Saving our trained model
from tensorflow.keras.models import save_model
if os.path.isfile('best_model.h5') is False:
    model.save('best_model.h5')
In [30]:
#Plotting a confusion matrix for checking the performance of our model
Y_pred = np.argmax(model.predict(X_test), axis = 1)
cnf = confusion_matrix(Y_test.argmax(axis = 1), Y_pred)


df_cnf = pd.DataFrame(cnf, range(2), range(2))
sns.set(font_scale = 2)
sns.heatmap(df_cnf, annot = True)
plt.title("Confusion Matrix")
plt.xlabel("True Values")
plt.ylabel("Prediction Values")
plt.show()
In [31]:
!deepCC best_model.h5
[INFO]
Reading [keras model] 'best_model.h5'
[SUCCESS]
Saved 'best_model_deepC/best_model.onnx'
[INFO]
Reading [onnx model] 'best_model_deepC/best_model.onnx'
[INFO]
Model info:
  ir_vesion : 4
  doc       : 
[WARNING]
[ONNX]: terminal (input/output) dense_input's shape is less than 1. Changing it to 1.
[WARNING]
[ONNX]: terminal (input/output) dense_4's shape is less than 1. Changing it to 1.
[INFO]
Running DNNC graph sanity check ...
[SUCCESS]
Passed sanity check.
[INFO]
Writing C++ file 'best_model_deepC/best_model.cpp'
[INFO]
deepSea model files are ready in 'best_model_deepC/' 
[RUNNING COMMAND]
g++ -std=c++11 -O3 -fno-rtti -fno-exceptions -I. -I/opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/include -isystem /opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/packages/eigen-eigen-323c052e1731 "best_model_deepC/best_model.cpp" -D_AITS_MAIN -o "best_model_deepC/best_model.exe"
[RUNNING COMMAND]
size "best_model_deepC/best_model.exe"
   text	   data	    bss	    dec	    hex	filename
 200085	   2984	    760	 203829	  31c35	best_model_deepC/best_model.exe
[SUCCESS]
Saved model as executable "best_model_deepC/best_model.exe"
In [ ]:
 
In [ ]: