In [1]:
# Import all the necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
Unzip the Dataset¶
In [2]:
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Wine_dataset.zip'
!unzip -qo Wine_dataset.zip
!rm Wine_dataset.zip
In [3]:
#Loading the data file using pandas library
data = pd.read_csv('winequality-red.csv', sep = ",")
data.head(10)
Out[3]:
Checking for NULL values¶
In [4]:
data.isna().sum()
Out[4]:
Data Visualization¶
In [5]:
# Checking for quality distribution in the dataset
sns.countplot(data = data, x = 'quality')
plt.title("Quality Distribution")
plt.xlabel("Quality Level")
plt.ylabel("Count")
Out[5]:
Since the quality distribution is far from ideal, with quality levels 5 and 6 heavily over-represented, let us pre-process the data into a two-class problem: one class containing quality levels {3, 4, 5} and the other containing quality levels {6, 7, 8}.
Class 0: {3, 4, 5}    Class 1: {6, 7, 8}
In [6]:
# Creating a new quality_level column
data['quality_level'] = data['quality'].apply(lambda x: 1 if x > 5 else 0)
X = data.drop(columns=['quality', 'quality_level'], axis=1)
y = data['quality_level'].values
In [7]:
sns.countplot(data = data, x = 'quality_level')
plt.title("Quality Distribution")
plt.xlabel("Quality Level")
plt.ylabel("Count")
Out[7]:
After checking the graph above, we can conclude that the two classes are now roughly evenly distributed, so the classifier is less likely to be biased toward a majority class.
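To back the visual check with numbers, a quick count of the new classes can confirm the balance; a minimal sketch using the quality_level column created above:
# Numeric check of the class balance
print(data['quality_level'].value_counts())
print(data['quality_level'].value_counts(normalize = True).round(3))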
Effect of alcohol on wine quality¶
In [8]:
# Effect of alcohol level on quality of wine
sns.lineplot(data = data, x = 'quality', y = 'alcohol')
Out[8]:
Plotting Pair Plots¶
In [9]:
# Visualising the relationship between different columns of the data
sns.pairplot(data)
plt.show()
In [10]:
plt.figure(figsize=(16,7))
sns.barplot(x="quality",y="fixed acidity",data=data)
plt.xlabel("Wine Quality", fontdict = {'size' : 13})
plt.ylabel("Acidity Level", fontdict = {'size' : 13})
plt.title("Effect of Acidity Level", fontdict = {'size' : 18})
Out[10]:
Understanding effect of pH level on wine quality¶
In [11]:
plt.figure(figsize = (10,10))
plt.scatter( data['free sulfur dioxide'], data['pH'],c = data['quality_level'], alpha = 0.7)
plt.xlabel("Free Sulpher Di-Oxide Level")
plt.ylabel("pH Level")
plt.grid(True)
In [12]:
print(data.head())
print("Shape of data is ", data.shape)
In [13]:
# Splitting our dataset into train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, y,test_size = 0.25,random_state = 0, stratify = y, shuffle = True)
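Because stratify = y was passed, both splits should preserve the original class ratio; a minimal sketch to verify this, using the np import from above:
# Confirm the stratified split kept the class proportions
print("Train class ratio:", np.bincount(Y_train) / len(Y_train))
print("Test class ratio: ", np.bincount(Y_test) / len(Y_test))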
Feature Scaling¶
In [14]:
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
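MinMaxScaler learns each feature's minimum and maximum from the training set only and rescales it to [0, 1]; the test set is transformed with those same parameters, so no information leaks from test to train. A minimal sanity check on the arrays above (test values can fall slightly outside [0, 1]):
# Training features now span [0, 1]; test features use the train-set scaling
print("Train min/max:", X_train.min(), X_train.max())
print("Test min/max: ", X_test.min(), X_test.max())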
In [15]:
# Convert the labels to one-hot (categorical) encoding
from tensorflow.keras.utils import to_categorical
Y_train = to_categorical(Y_train, num_classes=None)
Y_test = to_categorical(Y_test, num_classes=None)
print ("Y = ",Y_train.shape)
print ("X = ",X_train.shape)
In [16]:
es = EarlyStopping(monitor='val_loss', patience=5)
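This callback halts training once val_loss has failed to improve for 5 consecutive epochs. A variant worth considering (restore_best_weights is a standard Keras argument; es_best is just an illustrative name) also rolls the model back to the best-scoring epoch instead of keeping the final weights:
# Optional: restore the weights from the epoch with the lowest val_loss
es_best = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)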
Model Architecture¶
In [17]:
# Defining the architecture of our deep learning model
model = Sequential()
model.add(Dense(200, activation = "relu", input_dim = 11))
model.add(Dense(200, activation = "relu"))
model.add(Dense(200, activation = "relu"))
model.add(Dense(200, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(150, activation = "relu"))
model.add(Dense(150, activation = "relu"))
model.add(Dense(150, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(100, activation = "relu"))
model.add(Dense(100, activation = "relu"))
model.add(Dense(100, activation = "relu"))
model.add(Dense(50, activation = "relu"))
model.add(Dense(50, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(25, activation = "relu"))
model.add(Dense(2, activation = "softmax"))
model.summary()
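As a sanity check on model.summary(), the parameter count of a Dense layer is inputs × units + units (biases): 11 × 200 + 200 = 2,400 for the first hidden layer, and 200 × 200 + 200 = 40,200 for each subsequent 200-unit layer. A quick arithmetic sketch:
# Dense layer parameters: inputs * units + biases
print(11 * 200 + 200)   # 2400, first hidden layer
print(200 * 200 + 200)  # 40200, each subsequent 200-unit layer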
In [18]:
# Compiling the model
model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'categorical_crossentropy', metrics = ['accuracy'])
In [19]:
# Train the model with a batch size of 25 for up to 100 epochs (early stopping may end training sooner)
history = model.fit(X_train,
                    Y_train,
                    validation_data = (X_test, Y_test),
                    batch_size = 25,
                    epochs = 100,
                    callbacks = [es]
                    )
In [20]:
# Function to plot "accuracy vs epoch" and "loss vs epoch" curves for the training and validation data
def plot_metrics(history, metric = 'accuracy'):
    if metric == 'loss':
        plt.title("Loss Values")
        plt.plot(history.history['loss'], label = 'train')
        plt.plot(history.history['val_loss'], label = 'test')
    else:
        plt.title("Accuracy Values")
        plt.plot(history.history['accuracy'], label = 'train')
        plt.plot(history.history['val_accuracy'], label = 'test')
    plt.legend()
    plt.show()
In [21]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')
In [26]:
# Saving our trained model
from tensorflow.keras.models import save_model
if not os.path.isfile('best_model.h5'):
    model.save('best_model.h5')
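To reuse the trained network later, the saved HDF5 file can be loaded back with Keras' load_model; a minimal sketch (reloaded is just an illustrative name):
# Reload the saved model, architecture and weights included
from tensorflow.keras.models import load_model
reloaded = load_model('best_model.h5')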
Plotting a confusion matrix for checking the performance of our model¶
In [23]:
Y_pred = np.argmax(model.predict(X_test), axis = 1)
cnf = confusion_matrix(Y_test.argmax(axis = 1), Y_pred)
df_cnf = pd.DataFrame(cnf, range(2), range(2))
sns.set(font_scale = 2)
sns.heatmap(df_cnf, annot = True)
plt.title("Confusion Matrix")
plt.xlabel("True Values")
plt.ylabel("Prediction Values")
plt.show()
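Beyond the raw confusion matrix, scikit-learn's classification_report summarizes per-class precision, recall, and F1; a minimal sketch reusing the predictions computed above (the class names are illustrative):
# Per-class precision / recall / F1 for the same predictions
from sklearn.metrics import classification_report
print(classification_report(Y_test.argmax(axis = 1), Y_pred, target_names = ['class 0 (quality 3-5)', 'class 1 (quality 6-8)']))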
In [24]:
!deepCC best_model.h5