In [1]:
# Import all the necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
Unzip the Dataset¶
In [2]:
# Download the zipped wine-quality dataset from the URL below
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Wine_dataset.zip'
# Extract quietly (-q) and overwrite existing files without prompting (-o)
!unzip -qo Wine_dataset.zip
# Delete the archive once it has been extracted
!rm Wine_dataset.zip
In [3]:
# Read the comma-separated red-wine file into a DataFrame,
# then preview its first ten rows as the cell output.
data = pd.read_csv('winequality-red.csv', sep=",")
data.head(n=10)
Out[3]:
Checking for NULL values¶
In [4]:
# Per-column count of missing entries (isnull is an alias of isna)
data.isnull().sum()
Out[4]:
Data Visualization¶
In [5]:
# Bar chart of how many samples carry each raw quality score.
# The Axes returned by seaborn is labelled directly instead of going
# through the implicit pyplot "current axes".
quality_ax = sns.countplot(x='quality', data=data)
quality_ax.set_title("Quality Distribution")
quality_ax.set_xlabel("Quality Level")
quality_ax.set_ylabel("Count")
Out[5]:
Since the quality distribution is far from ideal, with quality levels 5 and 6 heavily over-represented in the data, let us pre-process it and turn this into a two-class problem: one class containing quality levels { 3, 4, 5 } and the other class containing quality levels { 6, 7, 8 }.
Class 0: { 3, 4, 5 } Class 1: { 6, 7, 8 }
In [6]:
# Create a binary quality-level column:
#   1 -> quality above 5, 0 -> quality of 5 or below.
# The comparison is vectorized rather than applied row by row with a Python
# lambda; the explicit int64 cast keeps the same dtype the lambda produced.
data['quality_level'] = (data['quality'] > 5).astype('int64')

# Features: every column except the raw score and the derived label.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
X = data.drop(columns=['quality', 'quality_level'])

# Target: the binary label as a NumPy array.
y = data['quality_level'].values
In [7]:
# Bar chart of sample counts per binary quality level (0 or 1),
# labelled through the Axes object that seaborn returns.
level_ax = sns.countplot(x='quality_level', data=data)
level_ax.set_title("Quality Distribution")
level_ax.set_xlabel("Quality Level")
level_ax.set_ylabel("Count")
Out[7]:
The graph above shows that the two classes are now roughly balanced, which should make the quality classification more reliable.
Effect of alcohol on wine quality¶
In [8]:
# Line plot of alcohol content (y) against the raw quality score (x)
sns.lineplot(x='quality', y='alcohol', data=data)
Out[8]:
Plotting Pair Plots¶
In [9]:
# Grid of pairwise plots across the columns of `data`,
# rendered explicitly with plt.show().
sns.pairplot(data=data)
plt.show()