Glass quality assessment¶
Credit: AITS Cainvas Community
The quality of glass refers to properties such as its transparency, heat resistance, and stability. It depends on many features, such as thickness and composition.
Here, we use a deep neural network to categorize glass samples based on their features.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
The dataset¶
The dataset is a CSV file with 15 feature categories for each sample. There are 2 categories of glass samples - 1 and 2.
# Read the glass samples directly from the hosted CSV file into a DataFrame.
df = pd.read_csv(
    'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Glass_train.csv'
)
# Notebook display of the loaded table.
df
Preprocessing¶
Removing redundant features¶
Let us look at the correlation between the feature categories -
# Pairwise Pearson correlation between the columns of df
# ('pearson' is the default method, spelled out here for clarity).
corr = df.corr(method='pearson')
# Notebook display of the correlation matrix.
corr
# Many column pairs are almost perfectly correlated, so one column of each
# such pair acts as a duplicate of the other.
# For every pair (i, j) with i < j whose absolute correlation exceeds 0.98,
# the later column (j) is dropped and the earlier one (i) is kept.
corr_columns = list(corr.columns)  # names in the same order as corr's rows/columns
redundant_columns = set()
for i in range(len(corr_columns)):
    for j in range(i + 1, len(corr_columns)):
        if abs(corr.iloc[i, j]) > 0.98:  # very high (positive or negative) correlation
            redundant_columns.add(corr_columns[j])
# The target column is not a feature and must survive the filter.
redundant_columns.discard('class')
# Keep the original column order, minus the redundant columns.
final_columns = [name for name in df.columns if name not in redundant_columns]
df = df[final_columns]  # selecting only the required columns
df
4 columns have been removed
- grade_A_Component_2 (inverted grade_A_component_1)
- xmax and ymax are very closely related to xmin and ymin respectively
- log area is directly related to pixel_area
Balancing the dataset¶
# Count how many samples carry each class label.
df['class'].value_counts()
It is an unbalanced dataset.
In order to balance the dataset, there are two options,
- upsampling - resample the values to make their count equal to the class label with the higher count (here, 887).
- downsampling - pick n samples from each class label where n = number of samples in class with least count (here, 471)
Here, we will be upsampling.
# Splitting into one dataframe for each category.
df_1 = df[df['class'] == 1]
df_2 = df[df['class'] == 2]
# Resampling class 2 with replacement until it has as many rows as class 1
# (assumes class 1 is the larger class). The seed matches the one used for the
# train/val/test splits so the whole pipeline is reproducible.
df_2 = df_2.sample(len(df_1), replace = True, random_state = 3)
# NOTE(review): upsampling happens before the train/val/test split, so copies
# of the same class-2 row can land in several splits and make the validation
# and test metrics optimistic. Consider splitting first and resampling only
# the training portion.
# Concatenating the two to form a single dataset.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df_2, df_1])
# Spread of values in categories
df['class'].value_counts()
Renaming the classes¶
Since this is a binary classification problem, we will rename samples belonging to class 2 as class 0, thus making the two categories for classification 0 and 1.
# Relabel the target: class 1 stays 1, every other label becomes 0.
df['class'] = df['class'].eq(1).astype('int')
df
# Every column except the last one is a model input; the last one is the target.
input_columns = list(df.columns[:-1])
output_columns = [df.columns[-1]]
input_columns, output_columns
Train val test split¶
# 80-10-10 split into train, validation and test sets, done in two steps.
# Step 1: hold out 20% of the rows.
train_df, val_test_df = train_test_split(df, test_size = 0.2, random_state = 3)
# Step 2: divide the held-out rows equally into validation and test.
val_df, test_df = train_test_split(val_test_df, test_size = 0.5, random_state = 3)

print("Number of samples in...")
for label, part in (
    ("Training set: ", train_df),
    ("Validation set: ", val_df),
    ("Testing set: ", test_df),
):
    print(label, len(part))

# Class balance of the two held-out sets.
val_df['class'].value_counts(), test_df['class'].value_counts()

# Separate each split into X (input columns) and y (output column) arrays.
Xtrain = np.array(train_df[input_columns])
ytrain = np.array(train_df[output_columns])
Xval = np.array(val_df[input_columns])
yval = np.array(val_df[output_columns])
Xtest = np.array(test_df[input_columns])
ytest = np.array(test_df[output_columns])
Scaling the values¶
As the feature values are on different scales, we use MinMaxScaler to scale the values to the range [0,1].
# The features live on different ranges, so rescale each one to [0, 1].
min_max_scaler = MinMaxScaler(feature_range = (0, 1))
# Learn the per-feature minimum and maximum from the training set alone...
min_max_scaler.fit(Xtrain)
# ...and apply that same mapping to all three splits.
Xtrain = min_max_scaler.transform(Xtrain)
Xval, Xtest = (min_max_scaler.transform(split) for split in (Xval, Xtest))
The model¶
# Fully connected binary classifier: three ReLU hidden layers followed by a
# single sigmoid unit that outputs the probability of class 1.
model = Sequential([
    Dense(128, activation = 'relu', input_shape = Xtrain[0].shape),
    Dense(64, activation = 'relu'),
    Dense(16, activation='relu'),
    Dense(1, activation = 'sigmoid')
])

# Stop when the validation loss has not improved for 10 epochs and roll the
# model back to the weights of its best epoch.
cb = [EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)]

# Print the layer-by-layer summary (the original line here was the stray,
# undefined token `mo`, which raised a NameError).
model.summary()

# Stage 1: train with a learning rate of 0.01.
model.compile(optimizer = Adam(0.01), loss = BinaryCrossentropy(), metrics=['accuracy'])
history = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks=cb)

# Stage 2: recompile with a learning rate of 0.001 and keep training the same
# model; recompiling swaps the optimizer but leaves the learned weights intact.
model.compile(optimizer = Adam(0.001), loss = BinaryCrossentropy(), metrics=['accuracy'])
history1 = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks=cb)
# Loss and accuracy on the held-out test set.
model.evaluate(Xtest, ytest)

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
test_predictions = (model.predict(Xtest) > 0.5).astype('int')
cm = confusion_matrix(ytest, test_predictions)
# Divide every row by its total so each row shows fractions of the true class.
cm = cm.astype('int') / cm.sum(axis=1, keepdims=True)

# Write each cell's value at its position: x is the column, y is the row.
for (i, j), fraction in np.ndenumerate(cm):
    plt.text(j, i, format(fraction, '.2f'), horizontalalignment="center", color="black")

plt.imshow(cm, cmap=plt.cm.Blues)
Plotting the metrics¶
def plot(history1, history2, variable1, variable2):
    """Plot two metrics across two consecutive training runs.

    The per-epoch values of the second run are appended after those of the
    first, and both metrics are drawn as lines on the current matplotlib axes.

    Args:
        history1: Mapping from metric name to a list of per-epoch values for
            the first training run (e.g. ``History.history``).
        history2: The same kind of mapping for the second training run.
        variable1: Key of the first metric to plot; also used as the title.
        variable2: Key of the second metric to plot.
    """
    # Build new lists instead of extending the ones stored in ``history1``:
    # extending in place would grow the caller's history on every call.
    var1_history = list(history1[variable1]) + list(history2[variable1])
    var2_history = list(history1[variable2]) + list(history2[variable2])

    # plotting them
    plt.plot(range(len(var1_history)), var1_history)
    plt.plot(range(len(var2_history)), var2_history)
    plt.legend([variable1, variable2])
    plt.title(variable1)
# Give each metric pair its own figure; otherwise both calls draw onto the
# same current axes and the second title and legend overwrite the first.
plt.figure()
plot(history.history, history1.history, "accuracy", 'val_accuracy')
plt.show()

plt.figure()
plot(history.history, history1.history, "loss", 'val_loss')
plt.show()
Prediction¶
# Choose one test sample at random.
x = random.randrange(len(Xtest))

# Slice out a single-row batch for the model and take its only output row.
output = model.predict(Xtest[x:x + 1])[0]
# A probability above 0.5 means class 1, otherwise class 0.
pred = (output > 0.5).astype('int')[0]

print("Predicted: ", pred, "(", output[0], "-->", pred, ")")
print("True: ", ytest[x, 0])
deepC¶
model.save('glass.h5')
!deepCC glass.h5
# pick random test data sample from one batch
x = random.randint(0, len(Xtest) - 1)
np.savetxt('sample.data', Xtest[x]) # xth sample into text file
# run exe with input
!glass_deepC/glass.exe sample.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
pred = (nn_out>0.5).astype('int') # finding category
print("\nPredicted: ", pred, "(", nn_out, "-->", pred, ")")
print("True: ", ytest[x][0])