In [1]:
# Import all the necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
Unzip the Dataset¶
In [2]:
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Wine_dataset.zip'
!unzip -qo Wine_dataset.zip
!rm Wine_dataset.zip
In [3]:
#Loading the data file using pandas library
data = pd.read_csv('winequality-red.csv', sep = ",")
data.head(10)
Out[3]:
Checking for NULL values¶
In [4]:
data.isna().sum()
Out[4]:
Data Visualization¶
In [5]:
# Checking for quality distribution in the dataset
sns.countplot(data = data, x = 'quality')
plt.title("Quality Distribution")
plt.xlabel("Quality Level")
plt.ylabel("Count")
Out[5]:
Since the quality distribution is far from ideal, with quality levels 5 and 6 heavily over-represented, let us pre-process the data into a two-class problem: one class containing quality levels {3, 4, 5} and the other containing quality levels {6, 7, 8}.
Class 0: {3, 4, 5}    Class 1: {6, 7, 8}
In [6]:
# Creating a new quality_level column
data['quality_level'] = data['quality'].apply(lambda x: 1 if x > 5 else 0)
X = data.drop(columns=['quality', 'quality_level'], axis=1)
y = data['quality_level'].values
In [7]:
sns.countplot(data = data, x = 'quality_level')
plt.title("Quality Distribution")
plt.xlabel("Quality Level")
plt.ylabel("Count")
Out[7]:
After checking the graph above, we can conclude that the two classes are now roughly evenly distributed, so the classifier is less likely to be biased toward a majority class.
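To back the visual check with numbers, a quick count of the new classes can confirm the balance; a minimal sketch using the quality_level column created above:
# Numeric check of the class balance
print(data['quality_level'].value_counts())
print(data['quality_level'].value_counts(normalize = True).round(3))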
Effect of alcohol on wine quality¶
In [8]:
# Effect of alcohol level on quality of wine
sns.lineplot(data = data, x = 'quality', y = 'alcohol')
Out[8]:
Plotting Pair Plots¶
In [9]:
# Visualising the relationship between different columns of the data
sns.pairplot(data)
plt.show()
In [10]:
plt.figure(figsize=(16,7))
sns.barplot(x="quality",y="fixed acidity",data=data)
plt.xlabel("Wine Quality", fontdict = {'size' : 13})
plt.ylabel("Acidity Level", fontdict = {'size' : 13})
plt.title("Effect of Acidity Level", fontdict = {'size' : 18})
Out[10]:
Understanding effect of pH level on wine quality¶
In [11]:
plt.figure(figsize = (10,10))
plt.scatter( data['free sulfur dioxide'], data['pH'],c = data['quality_level'], alpha = 0.7)
plt.xlabel("Free Sulpher Di-Oxide Level")
plt.ylabel("pH Level")
plt.grid(True)
In [12]:
print(data.head())
print("Shape of data is ", data.shape)
In [13]:
# Splitting our dataset into train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, y,test_size = 0.25,random_state = 0, stratify = y, shuffle = True)
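Because stratify = y was passed, both splits should preserve the original class ratio; a minimal sketch to verify this, using the np import from above:
# Confirm the stratified split kept the class proportions
print("Train class ratio:", np.bincount(Y_train) / len(Y_train))
print("Test class ratio: ", np.bincount(Y_test) / len(Y_test))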
Feature Scaling¶
In [14]:
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
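MinMaxScaler learns each feature's minimum and maximum from the training set only and rescales it to [0, 1]; the test set is transformed with those same parameters, so no information leaks from test to train. A minimal sanity check on the arrays above (test values can fall slightly outside [0, 1]):
# Training features now span [0, 1]; test features use the train-set scaling
print("Train min/max:", X_train.min(), X_train.max())
print("Test min/max: ", X_test.min(), X_test.max())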
In [15]:
# Convert the labels to one-hot (categorical) encoding
from tensorflow.keras.utils import to_categorical
Y_train = to_categorical(Y_train, num_classes=None)
Y_test = to_categorical(Y_test, num_classes=None)
print ("Y = ",Y_train.shape)
print ("X = ",X_train.shape)
In [16]:
es = EarlyStopping(monitor='val_loss', patience=5)
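This callback halts training once val_loss has failed to improve for 5 consecutive epochs. A variant worth considering (restore_best_weights is a standard Keras argument; es_best is just an illustrative name) also rolls the model back to the best-scoring epoch instead of keeping the final weights:
# Optional: restore the weights from the epoch with the lowest val_loss
es_best = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)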
Model Architecture¶
In [17]:
# Defining the architecture of our deep learning model
model = Sequential()
model.add(Dense(200, activation = "relu", input_dim = 11))
model.add(Dense(200, activation = "relu"))
model.add(Dense(200, activation = "relu"))
model.add(Dense(200, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(150, activation = "relu"))
model.add(Dense(150, activation = "relu"))
model.add(Dense(150, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(100, activation = "relu"))
model.add(Dense(100, activation = "relu"))
model.add(Dense(100, activation = "relu"))
model.add(Dense(50, activation = "relu"))
model.add(Dense(50, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(25, activation = "relu"))
model.add(Dense(2, activation = "softmax"))
model.summary()
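As a sanity check on model.summary(), the parameter count of a Dense layer is inputs × units + units (biases): 11 × 200 + 200 = 2,400 for the first hidden layer, and 200 × 200 + 200 = 40,200 for each subsequent 200-unit layer. A quick arithmetic sketch:
# Dense layer parameters: inputs * units + biases
print(11 * 200 + 200)   # 2400, first hidden layer
print(200 * 200 + 200)  # 40200, each subsequent 200-unit layer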
In [18]:
# Compiling the model
model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'categorical_crossentropy', metrics = ['accuracy'])
In [19]:
# Train the model with a batch size of 25 for up to 100 epochs (early stopping may end training sooner)
history = model.fit(X_train,
                    Y_train,
                    validation_data = (X_test, Y_test),
                    batch_size = 25,
                    epochs = 100,
                    callbacks = [es]
                    )
In [20]:
# Function to plot "accuracy vs epoch" and "loss vs epoch" curves for the training and validation data
def plot_metrics(history, metric = 'accuracy'):
    if metric == 'loss':
        plt.title("Loss Values")
        plt.plot(history.history['loss'], label = 'train')
        plt.plot(history.history['val_loss'], label = 'test')
    else:
        plt.title("Accuracy Values")
        plt.plot(history.history['accuracy'], label = 'train')
        plt.plot(history.history['val_accuracy'], label = 'test')
    plt.legend()
    plt.show()
In [21]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')
In [26]:
# Saving our trained model
from tensorflow.keras.models import save_model
if not os.path.isfile('best_model.h5'):
    model.save('best_model.h5')
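To reuse the trained network later, the saved HDF5 file can be loaded back with Keras' load_model; a minimal sketch (reloaded is just an illustrative name):
# Reload the saved model, architecture and weights included
from tensorflow.keras.models import load_model
reloaded = load_model('best_model.h5')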
Plotting a confusion matrix for checking the performance of our model¶
In [23]:
Y_pred = np.argmax(model.predict(X_test), axis = 1)
cnf = confusion_matrix(Y_test.argmax(axis = 1), Y_pred)
df_cnf = pd.DataFrame(cnf, range(2), range(2))
sns.set(font_scale = 2)
sns.heatmap(df_cnf, annot = True)
plt.title("Confusion Matrix")
plt.xlabel("True Values")
plt.ylabel("Prediction Values")
plt.show()
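Beyond the raw confusion matrix, scikit-learn's classification_report summarizes per-class precision, recall, and F1; a minimal sketch reusing the predictions computed above (the class names are illustrative):
# Per-class precision / recall / F1 for the same predictions
from sklearn.metrics import classification_report
print(classification_report(Y_test.argmax(axis = 1), Y_pred, target_names = ['class 0 (quality 3-5)', 'class 1 (quality 6-8)']))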
In [24]:
!deepCC best_model.h5