Abalone age prediction app¶
Credit: AITS Cainvas Community
Photo by Nico Medina on Dribbble
Abalone is the common name for a group of sea snails. Determining their age is a painstaking process: the shell is cut through the cone, stained, and the growth rings are counted under a microscope.
Here, we use measurements such as length, height, weight and other features to predict their age.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import models, optimizers, losses, layers, callbacks
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
The dataset¶
Data comes from an original (non-machine-learning) study: Warwick J Nash, Tracy L Sellers, Simon R Talbot, Andrew J Cawthorn and Wes B Ford (1994) "The Population Biology of Abalone (Haliotis species) in Tasmania. I. Blacklip Abalone (H. rubra) from the North Coast and Islands of Bass Strait", Sea Fisheries Division, Technical Report No. 48 (ISSN 1034-3288)
The dataset is a CSV file containing features of 4177 samples.
# Load the abalone dataset (4177 samples) from the hosted CSV into a dataframe.
csv_url = 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/abalone.csv'
df = pd.read_csv(csv_url)
df
# One-hot encode the categorical 'Sex' attribute (M/F/I); drop_first removes
# one redundant (linearly dependent) dummy column.
df_dummies = pd.get_dummies(df['Sex'], drop_first = True, prefix = "Sex_")
# Append the dummy columns at the end and discard the original column,
# matching the column order the manual insert-then-drop loop produced.
df = pd.concat([df, df_dummies], axis=1).drop(columns=['Sex'])
df
Encoding the output columns¶
def rings_label(x):
    """Map an abalone ring count to an age-group label.

    Buckets: x <= 10 -> 'young', 11-20 -> 'middle age', x > 20 -> 'old'.

    The original version only handled x <= 30 and implicitly returned
    None for larger counts, which would silently introduce NaN labels
    downstream; any count above 20 is now labelled 'old'.
    """
    if x <= 10:
        return 'young'
    if x <= 20:
        return 'middle age'
    # Covers 21-30 as before, plus any larger counts that previously
    # fell through and returned None.
    return 'old'
# Replace the numeric ring counts with categorical age-group labels.
df['Rings'] = df['Rings'].apply(rings_label)
df['Rings'].value_counts()
# One-hot encode the age-group labels; all three columns are kept because
# they form the model's softmax targets.
df_dummies = pd.get_dummies(df['Rings'])
# Append the label columns and drop the original 'Rings' column, matching
# the order produced by the manual insert-then-drop loop.
df = pd.concat([df, df_dummies], axis=1).drop(columns=['Rings'])
df
Defining the input and output columns¶
# Separate the feature names from the three one-hot target names so the
# dataset can be split into X and y in later cells.
output_columns = ['young', 'middle age', 'old']
input_columns = [col for col in df.columns if col not in output_columns]
print("Number of input columns: ", len(input_columns))
print("Number of output columns: ", len(output_columns))
Train validation test split¶
# 80/10/10 train/validation/test split: carve off 20% first, then halve it.
train_df, val_test_df = train_test_split(df, test_size = 0.2)
val_df, test_df = train_test_split(val_test_df, test_size = 0.5)

print("Number of samples in...")
print("Training set: ", len(train_df))
print("Validation set: ", len(val_df))
print("Testing set: ", len(test_df))

def _to_xy(frame):
    # Convert one partition into a (features, one-hot targets) array pair.
    return np.array(frame[input_columns]), np.array(frame[output_columns])

Xtrain, ytrain = _to_xy(train_df)
Xval, yval = _to_xy(val_df)
Xtest, ytest = _to_xy(test_df)
Standardization¶
# Fit the scaler on the training set only, then apply the same transform to
# the validation and test sets (avoids information leaking from them).
ss = StandardScaler().fit(Xtrain)
Xtrain = ss.transform(Xtrain)
Xval = ss.transform(Xval)
Xtest = ss.transform(Xtest)
The model¶
# Small fully-connected classifier: two hidden ReLU layers and a 3-way
# softmax output, one unit per age group.
model = models.Sequential()
model.add(layers.Dense(32, activation = 'relu', input_shape = Xtrain[0].shape))
model.add(layers.Dense(8, activation = 'relu'))
model.add(layers.Dense(3, activation = 'softmax'))

# Stop once validation loss fails to improve for 5 epochs and roll back to
# the best weights seen during training.
cb = callbacks.EarlyStopping(patience = 5, restore_best_weights = True)

model.summary()

model.compile(optimizer = optimizers.Adam(0.001),
              loss = losses.CategoricalCrossentropy(),
              metrics = ['accuracy'])

history = model.fit(Xtrain, ytrain,
                    validation_data = (Xval, yval),
                    epochs = 256,
                    callbacks = [cb])

model.evaluate(Xtest, ytest)
# Confusion matrix: true class along rows, predicted class along columns.
cm = confusion_matrix(np.argmax(ytest, axis = 1), (np.argmax(model.predict(Xtest), axis = 1)))
# Row-normalize so each cell is the fraction of that true class.
# NOTE: the original cast to 'int' was a typo — it is a no-op before true
# division; 'float' is the intended cast.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot(111)
# i indexes rows (true labels), j indexes columns (predictions). The original
# iterated i over shape[1] and j over shape[0]; that only worked because the
# matrix is square.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        # White text on dark (high-valued) cells keeps the labels legible.
        clr = "white" if cm[i, j] > 0.8 else "black"
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color=clr)
_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(3))
ax.set_yticks(range(3))
ax.set_xticklabels(output_columns, rotation = 90)
ax.set_yticklabels(output_columns)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Other attributes such as weather patterns and location (hence food availability) can help in classifying them better.
Plotting the metrics¶
def plot(history, variable, variable2):
    """Plot a training metric alongside its validation counterpart.

    history: the dict from a Keras History object (``history.history``).
    variable / variable2: metric keys, e.g. "loss" and "val_loss".

    A new figure is created per call — without it, successive calls in a
    plain script draw both metric pairs onto the same axes.
    """
    plt.figure()
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable2])), history[variable2])
    plt.legend([variable, variable2])
    plt.title(variable)
    plt.show()

plot(history.history, "loss", "val_loss")
plot(history.history, "accuracy", "val_accuracy")
Prediction¶
# Pick a random test sample and compare the model's prediction to the truth.
idx = random.randint(0, len(Xtest) - 1)
probs = model.predict(Xtest[idx].reshape(1, -1))[0]
best = np.argmax(probs)
print("Predicted: ", output_columns[best])
print("Probability: ", probs[best])
print("True: ", output_columns[np.argmax(ytest[idx])])
deepC¶
# Save the trained Keras model to HDF5 so the deepC compiler can consume it.
model.save('abalone.h5')
# Compile the model into a native executable with deepC.
# NOTE: the '!' lines below are IPython shell magics — notebook-only syntax.
!deepCC abalone.h5
# Pick a random test sample and dump it to a text file as the exe's input.
x = random.randint(0, len(Xtest) - 1)
print(x)
np.savetxt('sample.data', Xtest[x]) # xth sample into text file
# run exe with input
!abalone_deepC/abalone.exe sample.data
# show predicted output
# Load the compiled model's output probabilities from its result file
# (filename presumably fixed by the deepC toolchain — verify on the platform).
nn_out = np.loadtxt('deepSea_result_1.out')
# Sanity check: Keras prediction for the same sample should match nn_out.
print(model.predict(Xtest[x].reshape(1, -1))[0])
print(nn_out)
#print(x, Xtest[x])
print("Predicted: ", output_columns[np.argmax(nn_out)])
print("Probability: ", nn_out[np.argmax(nn_out)])
#print(x, Xtest[x])
print("True: ", output_columns[np.argmax(ytest[x])])