NOTE: This use case is not intended for resource-constrained devices.
Detecting Cervical Cancer¶
Credit: AITS Cainvas Community
Photo by Sharon Lee for LottieFiles on Dribbble
In [1]:
# Import all the necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
Loading the Dataset¶
In [2]:
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/cervical_cancer.zip'
!unzip -qo cervical_cancer.zip
!rm cervical_cancer.zip
In [3]:
# Loading the data file using the pandas library
data = pd.read_csv('kag_risk_factors_cervical_cancer.csv', sep = ",")
data.head(3)
Out[3]:
Dropping Redundant Data and Checking for NULL Values¶
In [4]:
data = data.drop(columns = ['STDs: Time since first diagnosis','STDs: Time since last diagnosis'])
data = data.replace('?', np.nan)
print(data.isna().sum())
Filling NULL Values with Mean Value of the Data¶
In [5]:
# Convert the remaining object columns to numeric first, then impute missing values with each column's mean
data = data.apply(pd.to_numeric)
data = data.fillna(data.mean())
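Mean imputation should leave no missing values behind; a quick check (a minimal sketch, assuming data is the frame produced above) confirms this before moving on.

# Sketch: confirm that no NaN values remain after mean imputation
assert data.isna().sum().sum() == 0, "Unexpected missing values remain"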
In [6]:
data.info()
Converting Data Types to Float for Preprocessing¶
In [7]:
# Change the datatype of each of these columns to float64
data['Age'] = data['Age'].astype(float)
data['STDs: Number of diagnosis'] = data['STDs: Number of diagnosis'].astype(float)
data['Dx:Cancer'] = data['Dx:Cancer'].astype(float)
data['Dx:CIN'] = data['Dx:CIN'].astype(float)
data['Dx:HPV'] = data['Dx:HPV'].astype(float)
data['Dx'] = data['Dx'].astype(float)
data['Hinselmann'] = data['Hinselmann'].astype(float)
data['Schiller'] = data['Schiller'].astype(float)
data['Citology'] = data['Citology'].astype(float)
data['Biopsy'] = data['Biopsy'].astype(float)
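Since every remaining column is numeric at this point, the same cast can also be written in one line; a compact alternative (a sketch, equivalent to the per-column casts above):

# Sketch: cast the whole frame to float64 in one call
data = data.astype('float64')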
Creating a Target Column from the Four Diagnostic Tests¶
In [8]:
data['count']=data['Hinselmann']+data['Schiller']+data['Citology']+data['Biopsy']
data['result']=np.where(data['count']>0,1,data['count'])
In [9]:
data['result'].unique()
Out[9]:
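In this dataset the positive class is relatively rare, which matters for the stratified split below and for interpreting accuracy later. A quick look at the class balance (a minimal sketch using the result column defined above):

# Sketch: inspect how imbalanced the derived target is
print(data['result'].value_counts())
print(data['result'].value_counts(normalize = True).round(3))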
Visualising the Relationship between Age & No. of Sexual Partners¶
In [10]:
plt.figure(figsize = (8, 5))
plt.xticks(rotation = 60)
sns.barplot(y=data['Age'],x=data['Number of sexual partners'],hue=data['Schiller'])
Out[10]:
In [11]:
plt.figure(figsize = (8, 5))
plt.xticks(rotation = 60)
sns.barplot(y=data['Age'],x=data['Number of sexual partners'],hue=data['Biopsy'])
Out[11]:
In [12]:
data_final = data.drop(columns = ['Hinselmann',
                                  'Schiller',
                                  'Citology',
                                  'Biopsy',
                                  'count',
                                  'STDs:condylomatosis',
                                  'STDs:cervical condylomatosis',
                                  'STDs:vulvo-perineal condylomatosis',
                                  'STDs:syphilis',
                                  'STDs:pelvic inflammatory disease',
                                  'STDs:genital herpes',
                                  'STDs:molluscum contagiosum',
                                  'STDs:AIDS', 'STDs:HIV',
                                  'STDs:Hepatitis B', 'STDs:HPV',
                                  'STDs: Number of diagnosis',
                                  'Dx:Cancer', 'Dx:CIN', 'Dx:HPV'
                                  ])
y = data_final['result']
X = data_final.drop(columns = ['result'])
In [13]:
# Plotting a heatmap/correlation plot to see how different values are related to each other
plt.figure(figsize=(15,15))
sns.heatmap(data_final.corr(),annot=True,linewidths=2)
plt.xticks(rotation = 60)
plt.show()
In [14]:
print(X.shape)
print(y.shape)
In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40, stratify = y)
In [16]:
#Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
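The scaler is fit on the training split only and then applied to the test split, which avoids leaking test-set statistics into training. A quick sanity check of the result (a minimal sketch):

# Sketch: scaled training features should have roughly zero mean and unit variance
print(X_train.mean(axis = 0).round(2))
print(X_train.std(axis = 0).round(2))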
In [17]:
# Getting the Final Data Shapes
print("Shape of Training Data")
print ("X = ",X_train.shape)
print ("Y = ",y_train.shape, "\n")
print("Shape of Testing Data")
print ("X = ",X_test.shape)
print ("Y = ",y_test.shape)
Training the Model¶
In [18]:
# Defining the architecture of our deep learning model
model = Sequential()
model.add(Dense(100, activation = "relu", input_dim = 15))
model.add(Dense(1, activation = "sigmoid"))
model.summary()
In [19]:
# Compiling the model
model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])
In [20]:
es = EarlyStopping(monitor = 'val_accuracy', patience = 5)
In [21]:
# Run the model for a batch size of 5 for 100 epochs
history = model.fit(X_train,
                    y_train,
                    validation_data = (X_test, y_test),
                    batch_size = 5,
                    epochs = 100,
                    callbacks = [es]
                    )
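Because positive cases are comparatively rare here, class weighting is one optional lever if the model struggles on the minority class. This is a hedged sketch; the class_weight dictionary and its use are not part of the original notebook.

# Sketch (optional): weight the rare positive class more heavily during training
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train)
class_weight = dict(enumerate(weights))
# history = model.fit(X_train, y_train, validation_data = (X_test, y_test),
#                     batch_size = 5, epochs = 100, callbacks = [es],
#                     class_weight = class_weight)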
Checking Model Training with Varying Epochs¶
In [22]:
# Function to plot "accuracy vs epoch" graphs and "loss vs epoch" graphs for training and validation data
def plot_metrics(model_name, metric = 'accuracy'):
    if metric == 'loss':
        plt.title("Loss Values")
        plt.plot(model_name.history['loss'], label = 'train')
        plt.plot(model_name.history['val_loss'], label = 'test')
        plt.legend()
        plt.show()
    else:
        plt.title("Accuracy Values")
        plt.plot(model_name.history['accuracy'], label='train')
        plt.plot(model_name.history['val_accuracy'], label='test')
        plt.legend()
        plt.show()
In [23]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')
In [27]:
# Predicting on the testing data (the sigmoid output is thresholded at 0.5)
Y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Save the trained model for later use
if os.path.isfile('best_model.h5') is False:
    model.save('best_model.h5')
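The saved HDF5 file can be reloaded later for inference without retraining (a minimal sketch, assuming best_model.h5 was written by the cell above):

# Sketch: reload the saved model for later inference
from tensorflow.keras.models import load_model
best_model = load_model('best_model.h5')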
Getting the Classification Report¶
In [25]:
# Getting a Classification Report for checking the performance of our model
print(classification_report(y_test, Y_pred, target_names = ['No Cancer', 'Cancer']))
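Given the class imbalance, a confusion matrix is a useful complement to the report above (a minimal sketch using the same y_test and Y_pred):

# Sketch: confusion matrix for the test predictions
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, Y_pred))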