NOTE: This use case is not intended for resource-constrained devices.
Obtaining the dataset
In [1]:
!wget -N https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Pneumothoraxdataset.zip
!unzip -qo Pneumothoraxdataset.zip
data_dir = 'small_train_data_set/small_train_data_set'  # directory containing the training images
Importing the required libraries
In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import cv2
from sklearn.model_selection import train_test_split
import sklearn
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
In [3]:
csv_path = 'small_train_data_set/small_train_data_set/train_data.csv'
df = pd.read_csv(csv_path)
#Randomly shuffling the dataset
df = df.sample(frac=1)
# Checking what the dataframe looks like
df.head()
Out[3]:
In [4]:
images = []
labels = []
# Reading the images, resizing them, and converting BGR to RGB
for i in df['file_name']:
    path = os.path.join(data_dir, i)
    img = cv2.imread(path)
    img = cv2.resize(img, (96, 96))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    images.append(img)
# Saving the labels in a list
for i in df['target']:
    labels.append(i)
# Normalizing pixel values to [0, 1] and converting to numpy arrays
images = [i / 255 for i in images]
X = np.array(images)
y = np.array(labels)
Plotting a few random samples to check what the dataset looks like
In [5]:
d = {0:'NO PNEUMOTHORAX', 1:'PNEUMOTHORAX'}
fig,axes = plt.subplots(3,3)
fig.subplots_adjust(0,0,2,2)
for i in range(3):
    for j in range(3):
        num = random.randint(0, len(images) - 1)  # randint is inclusive on both ends
        axes[i, j].imshow(images[num])
        axes[i, j].set_title("CLASS: " + str(labels[num]) + "\n" + "LABEL: " + str(d[labels[num]]))
        axes[i, j].axis('off')
In [6]:
# Checking the balance of the dataset
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n Negative: {} ({:.2f}% of total)'.format(
total, pos, 100 * pos / total, neg, 100 * neg / total))
sns.countplot(x=y)
plt.show()
The dataset is clearly imbalanced and needs to be handled. Oversampling the minority class is one way to address class imbalance; an alternative using class weights is sketched right after the oversampling cell below.
In [7]:
from imblearn.over_sampling import RandomOverSampler
# Flattening each image so the sampler sees a 2D array of shape (n_samples, n_features)
reshaped_X = X.reshape(X.shape[0], -1)
# Oversampling the minority class by duplicating samples until both classes are balanced
oversample = RandomOverSampler()
oversampled_X, oversampled_y = oversample.fit_resample(reshaped_X, y)
# Restoring the original image shape
new_X = oversampled_X.reshape(-1, 96, 96, 3)
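As an alternative to oversampling, class imbalance can also be handled by weighting the loss per class during training. A minimal sketch, assuming the counts neg and pos computed above; the resulting dictionary would be passed to model.fit via its class_weight argument (this path is not used in the rest of this notebook):

# Hypothetical alternative: up-weight the rarer class instead of duplicating samples
total = neg + pos
class_weight = {0: total / (2.0 * neg), 1: total / (2.0 * pos)}
# model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
#           validation_split=0.2, class_weight=class_weight)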
In [8]:
# This split of the original (imbalanced) data is superseded by the split of the oversampled data in the next cell.
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, stratify=y)
In [9]:
# Splitting the oversampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(new_X, oversampled_y, shuffle=True, test_size=0.1,stratify=oversampled_y)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
sns.countplot(ax=axes[0], x = y_train)
axes[0].set_title('Training data labels', fontsize = 14)
sns.countplot(ax=axes[1], x = y_test)
axes[1].set_title('Testing data labels', fontsize = 14)
plt.show()
Now the data is perfectly balanced and we can proceed to building a model.
In [10]:
# Training a simple convolutional model
model = keras.Sequential(
    [
        layers.Conv2D(8, input_shape=(96, 96, 3), padding="same", kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(16, kernel_size=(3, 3), padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(32, kernel_size=(3, 3), padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(32, kernel_size=(3, 3), padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        # layers.Conv2D(128, kernel_size=(3, 3), padding="same", activation="relu"),
        # layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.3),
        layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()
In [11]:
# Defining the hyperparameters
LOSS = keras.losses.BinaryCrossentropy()
LEARNING_RATE = 1e-4 #Choosing a smaller learning rate for a smoother curve
OPTIMIZER = keras.optimizers.Adam(LEARNING_RATE)
BATCH_SIZE = 64
EPOCHS = 100
In [12]:
model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=['AUC'])
history=model.fit(x=X_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)
In [13]:
# AUC Plot
plt.plot(history.history['auc'])
plt.plot(history.history['val_auc'])
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='center right')
plt.show()
#Loss Plot
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='upper right')
plt.show()
In [14]:
score = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {score[0]}\nTest AUC: {score[1] * 100}')
In [15]:
# Predictions on the test set
test = X_test.reshape(-1, 96, 96, 3)
# model.predict returns sigmoid probabilities; threshold at 0.5 to get class labels
# (Sequential.predict_classes has been removed in recent TensorFlow versions)
predictions = (model.predict(test) > 0.5).astype("int32")
# summarize the first 10 cases
# for i in range(10):
#     print(' predicted => %d (expected %d)' % (predictions[i], y_test[i]))
In [16]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
print('Confusion matrix : \n', matrix)
report = classification_report(y_test, predictions, labels=[0, 1])
print('Classification report : \n', report)
In [17]:
y_pred = (predictions > 0.5)
arr = sklearn.metrics.confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(arr, index=['absent', 'present'], columns=['absent', 'present'])
# plt.figure(figsize=(10,7))
sns.set(font_scale=1) # for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})
Out[17]:
In [18]:
#Saving the model
model.save('pneumo.h5')
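Before compiling with deepCC, the saved .h5 file can be reloaded in Keras to sanity-check that it still produces the expected predictions. A minimal sketch, assuming X_test and the label mapping d from above:

# Reload the saved model and run a single prediction (sanity check, not part of the original flow)
loaded = keras.models.load_model('pneumo.h5')
sample = X_test[:1]                          # one preprocessed 96x96 RGB image
prob = float(loaded.predict(sample)[0][0])   # sigmoid output in [0, 1]
print(d[int(prob > 0.5)], prob)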
In [19]:
!deepCC pneumo.h5