Classifying asteroids as hazardous or non-hazardous based on factors such as speed, distance from Earth, and mass.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
The dataset¶
The dataset is a CSV file with attributes of asteroids, such as distance from Earth, velocity, and orbit overlap, that help us distinguish hazardous asteroids from non-hazardous ones.
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/nasa.csv')
df
Data preprocessing¶
Data cleaning/Feature extraction¶
# Peek into the columns
df.columns
# Understanding the datatypes of the columns
df.dtypes
# What are the contents of the Equinox column?
df['Equinox'].value_counts()
# Drop unnecessary columns
df = df.drop(columns = ['Neo Reference ID', 'Name', 'Equinox', 'Close Approach Date', 'Orbit Determination Date', 'Orbiting Body', 'Epoch Date Close Approach'])
df
# Let's look at the correlation between the various attributes
corr = df.corr()
corr
The same distance and velocity measurements appear in several different units, making those columns effectively duplicates of one another.
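Before dropping anything, we can peek at which column pairs are near-duplicates. The sketch below reuses the corr matrix computed above; the 0.95 threshold matches the filter applied in the next cell.
# List the column pairs whose correlation crosses the 0.95 threshold
high_corr_pairs = [
    (corr.columns[i], corr.columns[j], round(corr.iloc[i, j], 3))
    for i in range(corr.shape[0])
    for j in range(i + 1, corr.shape[0])
    if corr.iloc[i, j] >= 0.95
]
high_corr_pairs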
# There are many column pairs with high correlation, effectively acting as duplicates.
# We can drop one column of any pair with a correlation >= 0.95.
final_columns = list(df.columns) # maintaining a temporary list to remove columns from

for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[0]):
        if corr.iloc[i, j] >= 0.95: # if very high correlation
            if list(df.columns)[j] in final_columns: # if not already removed
                final_columns.remove(list(df.columns)[j]) # remove
df = df[final_columns] # selecting only the required columns
df
The number of columns has been reduced considerably!
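As a quick check, assuming the final_columns list and corr matrix from the filtering cell above are still in scope, we can count how many columns survived the filter.
# How many columns remain after the correlation filter?
print(len(final_columns), "of", corr.shape[0], "columns retained")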
Converting to integer categorical variable¶
# Changing from bool (False/True) to int (0/1)
df['Hazardous'] = df['Hazardous'].astype('int')
Balancing the dataset¶
# Spread of values in categories
df['Hazardous'].value_counts()
There is a high class imbalance.
Here, we resample (with replacement) records from the category with fewer samples until its count matches that of the larger category.
# Splitting into one dataframe for each category.
df_h = df[df['Hazardous'] == 1]
df_nh = df[df['Hazardous'] == 0]
# Resampling
df_h = df_h.sample(len(df_nh), replace = True)
# Concatenating the two to form a single dataset
df = pd.concat([df_h, df_nh]) # DataFrame.append was removed in pandas 2.0
# Spread of values in categories
df['Hazardous'].value_counts()
Train test split¶
# Defining the input and output columns to separate the dataset in the later cells.
input_columns = list(df.columns[:-1])
output_columns = [df.columns[-1]]
print("Number of input columns: ", len(input_columns))
#print("Input columns: ", ', '.join(input_columns))
print("Number of output columns: ", len(output_columns))
#print("Output columns: ", ', '.join(output_columns))
# Splitting into train, val and test set -- 80-10-10 split
# First, an 80-20 split
train_df, val_test_df = train_test_split(df, test_size = 0.2, random_state = 113)
# Then split the 20% into half
val_df, test_df = train_test_split(val_test_df, test_size = 0.5, random_state = 113)
print("Number of samples in...")
print("Training set: ", len(train_df))
print("Validation set: ", len(val_df))
print("Testing set: ", len(test_df))
# Splitting into X (input) and y (output)
Xtrain, ytrain = np.array(train_df[input_columns]), np.array(train_df[output_columns])
Xval, yval = np.array(val_df[input_columns]), np.array(val_df[output_columns])
Xtest, ytest = np.array(test_df[input_columns]), np.array(test_df[output_columns])
Scaling¶
# Each feature has a different range.
# Using min_max_scaler to scale them to values in the range [0,1].
min_max_scaler = MinMaxScaler()
# Fit on training set alone
Xtrain = min_max_scaler.fit_transform(Xtrain)
# Use it to transform val and test input
Xval = min_max_scaler.transform(Xval)
Xtest = min_max_scaler.transform(Xtest)
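As an optional sanity check (a small sketch over the arrays defined above), the training features should now lie in [0, 1], while the validation and test features may fall slightly outside that range because the scaler was fitted on the training set only.
# Verify the effect of min-max scaling fitted on the training set alone
print("Train range:", Xtrain.min(), "to", Xtrain.max())
print("Val range  :", Xval.min(), "to", Xval.max())
print("Test range :", Xtest.min(), "to", Xtest.max())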
The model¶
model = Sequential([
    Dense(256, activation = 'relu', input_shape = Xtrain[0].shape),
    Dense(128, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])
cb = [EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)]
model.summary()
model.compile(optimizer = Adam(0.001), loss = BinaryCrossentropy(), metrics=['accuracy'])
history = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks = cb)
model.evaluate(Xtest, ytest)
cm = confusion_matrix(ytest, (model.predict(Xtest)>0.5).astype('int'))
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
for i in range(cm.shape[1]):
    for j in range(cm.shape[0]):
        plt.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color="black")

plt.imshow(cm, cmap=plt.cm.Blues)
It is important to avoid false negatives (cell [1, 0] in the matrix) in problems of this type. A false negative is the classification of a hazardous asteroid as non-hazardous. A larger training set would help.
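To put a number on that concern, here is a short sketch (using the model, Xtest and ytest defined above; recall_score is scikit-learn's standard metric) that computes recall for the hazardous class, i.e. the fraction of truly hazardous asteroids the model catches.
from sklearn.metrics import recall_score

# Recall for the hazardous class: 1 minus the false negative rate
ypred = (model.predict(Xtest) > 0.5).astype('int')
print("Hazardous-class recall:", recall_score(ytest.ravel(), ypred.ravel()))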
Plotting the metrics¶
def plot(history, variable, variable1):
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable1])), history[variable1])
    plt.title(variable)
    plt.legend([variable, variable1])
plot(history.history, "accuracy", "val_accuracy")
plot(history.history, "loss", "val_loss")
Prediction¶
# Defining output labels in accordance with their array indices.
output_labels = ['Non-hazardous', 'Hazardous']
# Pick random test sample
i = random.randint(0, len(test_df)-1)
model_output = model.predict(Xtest[i].reshape(1, -1))[0][0]
pred = (model_output>0.5).astype('int')
# show predicted output
print ("\nModel predicted : ", output_labels[pred], "(", model_output, "-->", pred, ")")
# actual output
print("Actual asteroid type : ", output_labels[ytest[i][0]])
deepC¶
model.save('asteroid.h5')
!deepCC asteroid.h5
# Pick random test sample
i = random.randint(0, len(test_df)-1)
np.savetxt('sample.data', Xtest[i])
# run exe with input
!asteroid_deepC/asteroid.exe sample.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
pred = (nn_out>0.5).astype('int')
print ("\nModel predicted : ", output_labels[pred], "(", nn_out, "-->", pred, ")")
# actual output
print("Actual asteroid type : ", output_labels[ytest[i][0]])