Stroke Detection using Deep Learning¶
In [4]:
# Import all the necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
Download and Unzip the Data¶
In [5]:
# Download the dataset archive from the URL below into the working directory
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/kkharbanda90/archive.zip'
# Extract it quietly (-q), overwriting existing files without prompting (-o)
!unzip -qo archive.zip
# Delete the archive once its contents have been extracted
!rm archive.zip
In [6]:
# Read the comma-separated stroke dataset into a DataFrame and preview its first three rows
data = pd.read_csv(
    'healthcare-dataset-stroke-data.csv',
    sep=",",
)
data.head(n=3)
Out[6]:
Next, we remove the ID column, since it is not used for training the model. After dropping this irrelevant column, we check the remaining columns for missing (NA) values.
In [7]:
# Drop the id column, which is not used for training
data = data.drop(columns=["id"])
# Count the missing values in each remaining column
data.isna().sum()
Out[7]:
Fill the missing (NA) values in the body mass index (bmi) column with the column's mean value.
In [8]:
# Replace missing BMI entries with the mean of the observed BMI values
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
Visualising The Data¶
In [9]:
# Count the records in each stroke class to show how balanced the target is
sns.countplot(data = data, x = 'stroke')
plt.title("Stroke Occurrence")
Out[9]:
In [10]:
# Line plot of age against smoking status, with one line per stroke class
smoking_age_ax = sns.lineplot(x='smoking_status', y='age', hue='stroke', data=data)
smoking_age_ax.set_title("Effect of Smoking and Age on Stroke")
Out[10]:
In [11]:
# Grid of pairwise plots across the columns of the data (each panel 2 inches high)
sns.pairplot(data=data, height=2)
plt.show()
In [12]:
# Compare stroke counts between the ever_married categories
sns.countplot(data = data, x = 'ever_married', hue = 'stroke')
# The title describes what is actually plotted (it was copy-pasted from the smoking/age plot)
plt.title("Effect of Marital Status on Stroke")
Out[12]:
In [13]:
# Box plot of BMI per smoking status, split by stroke class, drawn on an explicit wide axes
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(x='smoking_status', y='bmi', hue='stroke', data=data, fliersize=3, ax=ax)
ax.set_title("Effect of Smoking and BMI on Stroke", fontdict={'size': 25})
ax.set_xlabel("Smokers or Non-Smokers", fontdict={'size': 16})
ax.set_ylabel("Body Mass Index", fontdict={'size': 16})
Out[13]:
In [14]:
# Plot a correlation heatmap to see how the numeric columns relate to each other
plt.figure(figsize=(15,12))
# The categorical columns are still strings at this point; restrict to numeric
# columns explicitly so corr() behaves the same on every pandas version
numeric_corr = data.select_dtypes(include="number").corr()
# Correlations lie in [-1, 1], so centre the diverging colormap on 0
# (center=True was interpreted as the number 1)
sns.heatmap(numeric_corr, annot=True, linewidths=2, center=0)
plt.show()
In [15]:
# Re-count missing values per column to confirm none remain after the BMI imputation
data.isnull().sum()
Out[15]:
In [16]:
# Preview the first five rows before the categorical columns are encoded
data.head(n=5)
Out[16]:
We use a label encoder to convert the categorical columns into numerical form so that the data can be fed to the DNN model.
In [17]:
# Integer-encode every categorical column in turn.
# The single encoder is re-fitted per column, so afterwards it only holds the
# class mapping of the last column processed (smoking_status).
le = LabelEncoder()
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for column in categorical_columns:
    data[column] = le.fit_transform(data[column])
In [18]:
# Preview the first five rows after label encoding
data.head(n=5)
Out[18]:
In [19]:
print(data.shape)
# The first ten columns are the model inputs; the last column is the target
feature_frame = data.iloc[:, :10]
target_series = data.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()
In [ ]:
In [20]:
# Hold out 40% of the rows for testing; the split is shuffled with a fixed seed
# and stratified on y so both parts keep the same class proportions
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
    stratify=y,
    random_state=0,
)
In [21]:
# Standardise the features: fit the scaler on the training split only, then
# apply the same fitted transformation to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
In [22]:
# One-hot encode the integer class labels of both splits
from tensorflow.keras.utils import to_categorical
Y_train, Y_test = (to_categorical(labels, num_classes=None) for labels in (Y_train, Y_test))
# Report the shapes of the training labels and training features
for tag, array in (("Y = ", Y_train), ("X = ", X_train)):
    print(tag, array.shape)
In [23]:
# Stop training once the validation loss has not improved for 5 consecutive epochs
es = EarlyStopping(
    monitor='val_loss',
    patience=5,
)
In [24]:
# Fully connected classifier: 10 input features -> 100 -> 100 -> 50 -> 40 -> 2-way softmax,
# with 30% dropout after the first, third and fourth hidden layers
model = Sequential([
    Dense(100, activation="relu", input_dim=10),
    Dropout(0.3),
    Dense(100, activation="relu"),
    Dense(50, activation="relu"),
    Dropout(0.3),
    Dense(40, activation="relu"),
    Dropout(0.3),
    Dense(2, activation="softmax"),
])
model.summary()
In [25]:
# Compile the model with the Adam optimizer and categorical cross-entropy
# (the labels are one-hot encoded), tracking accuracy during training.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer = Adam(learning_rate = 0.0001), loss = 'categorical_crossentropy', metrics = ['accuracy'])
In [26]:
# Train with a batch size of 35 for up to 100 epochs; the early-stopping
# callback may end training sooner.
# validation_steps is intentionally not set: with a batch size of 35 the
# previous value of 10 restricted validation to 350 samples, so val_loss
# (and therefore early stopping) ignored most of the held-out data.
history = model.fit(
    X_train,
    Y_train,
    validation_data = (X_test, Y_test),
    batch_size = 35,
    epochs = 100,
    callbacks = [es],
)
In [27]:
# Function to plot "accuracy vs epoch" graphs and "loss vs epoch" graphs for training and validation data
def plot_metrics(model_name, metric = 'accuracy'):
    """Plot the training and validation curves of one metric.

    Parameters
    ----------
    model_name
        Object exposing a ``history`` mapping with the keys 'loss', 'val_loss',
        'accuracy' and 'val_accuracy' (called in this notebook with the value
        returned by ``model.fit``).
    metric
        'loss' plots the loss curves; any other value plots the accuracy curves.
    """
    # Pick the figure title and the history key for the requested metric
    if metric == 'loss':
        heading, key = "Loss Values", 'loss'
    else:
        heading, key = "Accuracy Values", 'accuracy'

    plt.title(heading)
    plt.plot(model_name.history[key], label = 'train')
    # The validation series is stored under the same key with a 'val_' prefix
    plt.plot(model_name.history['val_' + key], label = 'test')
    plt.legend()
    plt.show()
In [28]:
# Draw the accuracy curves first, then the loss curves
for curve in ('accuracy', 'loss'):
    plot_metrics(history, curve)
In [29]:
# Save the trained model in HDF5 format.
# Always overwrite: skipping the save when best_model.h5 already exists would
# leave a stale file from an earlier run, and the file on disk would no longer
# match the model that was just trained.
from tensorflow.keras.models import save_model
model.save('best_model.h5')
In [30]:
# Plot a confusion matrix to check the performance of the model on the test split
# Predicted class = index of the highest softmax output for each sample
Y_pred = np.argmax(model.predict(X_test), axis = 1)
# Y_test is one-hot encoded, so argmax recovers the integer class labels
cnf = confusion_matrix(Y_test.argmax(axis = 1), Y_pred)
df_cnf = pd.DataFrame(cnf, range(2), range(2))
sns.set(font_scale = 2)
# fmt='d' annotates the cells with plain integer counts instead of scientific notation
sns.heatmap(df_cnf, annot = True, fmt = 'd')
plt.title("Confusion Matrix")
# confusion_matrix(y_true, y_pred) puts the true classes on the rows (heatmap
# y-axis) and the predicted classes on the columns (heatmap x-axis)
plt.xlabel("Predicted Values")
plt.ylabel("True Values")
plt.show()
In [31]:
# Run the deepCC command-line tool on the saved model file
# NOTE(review): presumably this compiles the Keras model for deployment — confirm against the deepCC docs
!deepCC best_model.h5
In [ ]:
In [ ]: