Financial distress prediction¶
Credit: AITS Cainvas Community
Photo by Shunya Koide on Dribbble
Predicting whether a given company is under financial distress based on time-series data for different companies.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from tensorflow.keras import optimizers, layers, models, losses, callbacks
The dataset¶
The dataset is a CSV file with financial distress indicators for a set of companies.
Along with company and time-period identifiers, there are 83 factors, denoted x1 to x83, that are financial and non-financial characteristics of the companies. Of these, x80 is a categorical feature. The 'Financial Distress' column is a continuous variable that can be converted into a two-value column: healthy (0) if value > -0.5, else distressed (1).
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Financial_Distress.csv')
df
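Before binarizing, it is worth seeing how the continuous target sits around the -0.5 threshold. A minimal sketch, assuming the CSV has loaded into df as above:
# Quick look at the continuous target around the -0.5 threshold
print(df['Financial Distress'].describe())
print("Fraction of rows at or below -0.5: ", (df['Financial Distress'] <= -0.5).mean())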
# Understanding the data
# There are many time periods for each company.
print(df.groupby('Company')['Time'].count())
There are companies with fewer than 5 time periods too!
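A quick count of such companies (a sketch over the same df):
# Companies with fewer time periods than the 5-step window used later
period_counts = df.groupby('Company')['Time'].count()
print((period_counts < 5).sum(), "companies have fewer than 5 time periods")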
df80 = pd.get_dummies(df['x80'], drop_first = True, prefix = '80')
for column in df80.columns:
    df[column] = df80[column]
df = df.drop(columns = ['x80'])
df
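To confirm the encoding, the new dummy columns can be listed; the '80_' prefix comes from the prefix argument above (a small sketch):
# List the one-hot columns created from x80
print([c for c in df.columns if c.startswith('80_')])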
Creating time-based dataframe¶
Since this is a time-based dataset, each sample's features are extended to include the values from previous timesteps of the same company.
# Defining the time window, that is, how many timesteps to include
time_window = 5
# Dataframes that hold rows grouped by company
df_company_grouped = df.groupby('Company')
# Column values affected by time - all except Company, Time, Financial Distress and x80 (the categorical variable that was one-hot encoded)
time_affected_columns = [c for c in df.columns if c[0] == 'x'] # Starts with x
# Final dataframe
df_final = pd.DataFrame()
# For each company
for company in df_company_grouped:
    # If the company has at least time_window timesteps, keep it; else discard
    if time_window <= len(company[1]):
        # Skipping time_window-1 rows from the beginning, and looping till the end
        for row_num in range(time_window, len(company[1])+1):
            # Picking the time_window-th row
            df_temp = company[1].iloc[row_num-1, :]
            # Appending values from the time_window-1 rows before it
            for i in range(time_window-1):
                df_temp_i = company[1].iloc[row_num-2-i][time_affected_columns]    # Pick the necessary columns of the previous row
                df_temp_i.index = [c + '_t-' + str(i+1) for c in time_affected_columns]    # Rename lagged columns to keep labels unique
                df_temp = pd.concat([df_temp, df_temp_i], axis = 0)    # Append values
            df_temp = df_temp.to_frame().transpose()    # Series to DataFrame
            df_final = pd.concat([df_final, df_temp])    # Add as row to final dataframe
# Reset index
df_final = df_final.reset_index(drop = True)
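A sanity check on the result (a sketch): each row should hold all original columns plus time_window-1 sets of lagged feature values.
# Expected width: original columns + (time_window-1) * len(time_affected_columns)
print(df_final.shape)
print("Expected columns: ", len(df.columns) + (time_window - 1) * len(time_affected_columns))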
Binarizing the target variable¶
This is done based on the condition: healthy (0) if value > -0.5, else distressed (1).
df_final['Financial Distress'] = (df_final['Financial Distress'] <= -0.5).astype('int')
df_final['Financial Distress'].value_counts()
Balancing the dataset¶
Since each row now packs a full 5-timestep window for a single company, the rows can be treated as independent samples; resampling and a random (non-chronological) train-test split are acceptable here.
# separating into 2 dataframes, one for each class
df0 = df_final[df_final['Financial Distress'] == 0]
df1 = df_final[df_final['Financial Distress'] == 1]
print("Number of samples in:")
print("Class label 0 - ", len(df0))
print("Class label 1 - ", len(df1))
# Upsampling
df1 = df1.sample(1000, replace = True)    # replace = True enables sampling with replacement
print('\nAfter resampling - ')
print("Number of samples in:")
print("Class label 0 - ", len(df0))
print("Class label 1 - ", len(df1))
df = pd.concat([df1, df0])    # DataFrame.append was removed in pandas 2.0
df['Financial Distress'].value_counts()
Defining the input and output columns¶
# defining the input and output columns to separate the dataset in the later cells.
input_columns = list(df.columns)
input_columns.remove('Financial Distress')
output_columns = ['Financial Distress']
print("Number of input columns: ", len(input_columns))
#print("Input columns: ", ', '.join(input_columns))
print("Number of output columns: ", len(output_columns))
#print("Output columns: ", ', '.join(output_columns))
Train-val-test split¶
# Splitting into train, val and test set -- 80-10-10 split
# First, an 80-20 split
train_df, val_test_df = train_test_split(df, test_size = 0.2)
# Then split the 20% into half
val_df, test_df = train_test_split(val_test_df, test_size = 0.5)
print("Number of samples in...")
print("Training set: ", len(train_df))
print("Validation set: ", len(val_df))
print("Testing set: ", len(test_df))
# Splitting into X (input) and y (output)
Xtrain, ytrain = np.array(train_df[input_columns]), np.array(train_df[output_columns])
Xval, yval = np.array(val_df[input_columns]), np.array(val_df[output_columns])
Xtest, ytest = np.array(test_df[input_columns]), np.array(test_df[output_columns])
Scaling the values¶
The values in the feature columns are not of the same range.
# Each feature has a different range.
# Using min_max_scaler to scale them to values in the range [0,1].
min_max_scaler = MinMaxScaler()
# Fit on training set alone
Xtrain = min_max_scaler.fit_transform(Xtrain)
# Use it to transform val and test input
Xval = min_max_scaler.transform(Xval)
Xtest = min_max_scaler.transform(Xtest)
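The fitted scaler will also be needed for any inference-time inputs (for example, samples fed to the deepC build below). A minimal persistence sketch using joblib; the filename is an assumption, not part of the original flow:
import joblib
joblib.dump(min_max_scaler, 'min_max_scaler.joblib')    # assumed filename; reload with joblib.load before scaling new inputs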
The model¶
model = models.Sequential([
layers.Dense(32, activation = 'relu', input_shape = Xtrain[0].shape),
layers.Dense(16, activation = 'relu'),
layers.Dense(8, activation = 'relu'),
layers.Dense(1, activation = 'sigmoid')
])
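# Stop training when the validation loss stops improving for 5 epochs, and roll back to the best weights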
cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]
model.summary()
model.compile(optimizer = optimizers.Adam(0.001), loss = losses.BinaryCrossentropy(), metrics = ['accuracy'])
history = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks = cb)
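# Fine-tuning: continue from the current weights with a 10x lower learning rate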
model.compile(optimizer = optimizers.Adam(0.0001), loss = losses.BinaryCrossentropy(), metrics = ['accuracy'])
history1 = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks = cb)
model.evaluate(Xtest, ytest)
ypred = (model.predict(Xtest)>0.5).astype('int')
cm = confusion_matrix(ytest, ypred)
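# Normalize each true-class row so the cells show per-class fractions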
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
fig = plt.figure(figsize = (4, 4))
ax = fig.add_subplot(111)
for i in range(cm.shape[1]):
    for j in range(cm.shape[0]):
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color="black")
_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(2))
ax.set_yticks(range(2))
ax.set_xticklabels(range(2))
ax.set_yticklabels(range(2))
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Plotting the metrics¶
def plot(history1, history2, variable1, variable2):
    # combining metrics from both trainings
    var1_history = history1[variable1]
    var1_history.extend(history2[variable1])
    var2_history = history1[variable2]
    var2_history.extend(history2[variable2])
    # plotting them
    plt.plot(range(len(var1_history)), var1_history)
    plt.plot(range(len(var2_history)), var2_history)
    plt.legend([variable1, variable2])
    plt.title(variable1)
    plt.show()    # render each metric pair in its own figure
plot(history.history, history1.history, "accuracy", 'val_accuracy')
plot(history.history, history1.history, "loss", 'val_loss')
Prediction¶
# pick a random test data sample
x = random.randint(0, len(Xtest) - 1)
output = model.predict(Xtest[x].reshape(1, -1))[0][0]
pred = (output>0.5).astype('int')
print("Predicted: ", pred, "(", output, "-->", pred, ")")
print("True: ", ytest[x][0])
deepC¶
model.save('financial_distress_prediction.h5')
!deepCC financial_distress_prediction.h5
# pick a random test data sample
x = random.randint(0, len(Xtest) - 1)
np.savetxt('sample.data', Xtest[x]) # xth sample into text file
# run exe with input
!financial_distress_prediction_deepC/financial_distress_prediction.exe sample.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
pred = (nn_out>0.5).astype('int')
print("Predicted: ", pred, "(", nn_out, "-->", pred, ")")
print("True: ", ytest[x][0])