Financial distress prediction¶
Credit: AITS Cainvas Community
Photo by Shunya Koide on Dribbble
Predicting whether a given company is under financial distress based on time-series data for different companies.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from tensorflow.keras import optimizers, layers, models, losses, callbacks
The dataset¶
The dataset is a CSV file with financial distress indicators for a set of companies.
Along with company and time-period identifiers, there are 83 factors, denoted x1 to x83, that are financial and non-financial characteristics of the companies. Of these, x80 is a categorical feature. The 'Financial Distress' column is a continuous variable that can be converted into a two-value column: healthy (0) if value > -0.5, else distressed (1).
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Financial_Distress.csv')
df
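Before binarizing, it is worth seeing how the continuous target sits around the -0.5 threshold. A minimal sketch, assuming the CSV has loaded into df as above:
# Quick look at the continuous target around the -0.5 threshold
print(df['Financial Distress'].describe())
print("Fraction of rows at or below -0.5: ", (df['Financial Distress'] <= -0.5).mean())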
# Understanding the data
# There are many time periods for each company.
print(df.groupby('Company')['Time'].count())
There are companies with fewer than 5 time periods too!
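A quick count of such companies (a sketch over the same df):
# Companies with fewer time periods than the 5-step window used later
period_counts = df.groupby('Company')['Time'].count()
print((period_counts < 5).sum(), "companies have fewer than 5 time periods")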
df80 = pd.get_dummies(df['x80'], drop_first = True, prefix = '80')
for column in df80.columns:
    df[column] = df80[column]
df = df.drop(columns = ['x80'])
df
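To confirm the encoding, the new dummy columns can be listed; the '80_' prefix comes from the prefix argument above (a small sketch):
# List the one-hot columns created from x80
print([c for c in df.columns if c.startswith('80_')])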
Creating time-based dataframe¶
Since this is a time-based dataset, each sample's features are extended to include the values from previous timesteps of the same company.
# Defining the time window, that is, how many timesteps to include
time_window = 5
# Dataframes that hold rows grouped by company
df_company_grouped = df.groupby('Company')
# Column values affected by time - all except Company, Time, Financial Distress and x80 (the categorical variable that was one-hot encoded)
time_affected_columns = [c for c in df.columns if c[0] == 'x'] # Starts with x
# Final dataframe
df_final = pd.DataFrame()
# For each company
for company in df_company_grouped:
    # If the company has at least time_window timesteps, keep it; else discard
    if time_window <= len(company[1]):
        # Skipping time_window-1 rows from the beginning, and looping till the end
        for row_num in range(time_window, len(company[1])+1):
            # Picking the time_window-th row
            df_temp = company[1].iloc[row_num-1, :]
            # Appending values from the time_window-1 rows before it
            for i in range(time_window-1):
                df_temp_i = company[1].iloc[row_num-2-i][time_affected_columns]    # Pick the necessary columns of the previous row
                df_temp_i.index = [c + '_t-' + str(i+1) for c in time_affected_columns]    # Rename lagged columns to keep labels unique
                df_temp = pd.concat([df_temp, df_temp_i], axis = 0)    # Append values
            df_temp = df_temp.to_frame().transpose()    # Series to DataFrame
            df_final = pd.concat([df_final, df_temp])    # Add as row to final dataframe
# Reset index
df_final = df_final.reset_index(drop = True)
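A sanity check on the result (a sketch): each row should hold all original columns plus time_window-1 sets of lagged feature values.
# Expected width: original columns + (time_window-1) * len(time_affected_columns)
print(df_final.shape)
print("Expected columns: ", len(df.columns) + (time_window - 1) * len(time_affected_columns))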
Binarizing the target variable¶
This is done based on the condition: healthy (0) if value > -0.5, else distressed (1).
df_final['Financial Distress'] = (df_final['Financial Distress'] <= -0.5).astype('int')
df_final['Financial Distress'].value_counts()
Balancing the dataset¶
Since each row now packs a full 5-timestep window for a single company, the rows can be treated as independent samples; resampling and a random (non-chronological) train-test split are acceptable here.
# separating into 2 dataframes, one for each class
df0 = df_final[df_final['Financial Distress'] == 0]
df1 = df_final[df_final['Financial Distress'] == 1]
print("Number of samples in:")
print("Class label 0 - ", len(df0))
print("Class label 1 - ", len(df1))
# Upsampling
df1 = df1.sample(1000, replace = True)    # replace = True enables sampling with replacement
print('\nAfter resampling - ')
print("Number of samples in:")
print("Class label 0 - ", len(df0))
print("Class label 1 - ", len(df1))
df = pd.concat([df1, df0])    # DataFrame.append was removed in pandas 2.0
df['Financial Distress'].value_counts()
Defining the input and output columns¶
# defining the input and output columns to separate the dataset in the later cells.
input_columns = list(df.columns)
input_columns.remove('Financial Distress')
output_columns = ['Financial Distress']
print("Number of input columns: ", len(input_columns))
#print("Input columns: ", ', '.join(input_columns))
print("Number of output columns: ", len(output_columns))
#print("Output columns: ", ', '.join(output_columns))
Train-val-test split¶
# Splitting into train, val and test set -- 80-10-10 split
# First, an 80-20 split
train_df, val_test_df = train_test_split(df, test_size = 0.2)
# Then split the 20% into half
val_df, test_df = train_test_split(val_test_df, test_size = 0.5)
print("Number of samples in...")
print("Training set: ", len(train_df))
print("Validation set: ", len(val_df))
print("Testing set: ", len(test_df))
# Splitting into X (input) and y (output)
Xtrain, ytrain = np.array(train_df[input_columns]), np.array(train_df[output_columns])
Xval, yval = np.array(val_df[input_columns]), np.array(val_df[output_columns])
Xtest, ytest = np.array(test_df[input_columns]), np.array(test_df[output_columns])
Scaling the values¶
The values in the feature columns are not of the same range.
# Each feature has a different range.
# Using min_max_scaler to scale them to values in the range [0,1].
min_max_scaler = MinMaxScaler()
# Fit on training set alone
Xtrain = min_max_scaler.fit_transform(Xtrain)
# Use it to transform val and test input
Xval = min_max_scaler.transform(Xval)
Xtest = min_max_scaler.transform(Xtest)
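The fitted scaler will also be needed for any inference-time inputs (for example, samples fed to the deepC build below). A minimal persistence sketch using joblib; the filename is an assumption, not part of the original flow:
import joblib
joblib.dump(min_max_scaler, 'min_max_scaler.joblib')    # assumed filename; reload with joblib.load before scaling new inputs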
The model¶
model = models.Sequential([
layers.Dense(32, activation = 'relu', input_shape = Xtrain[0].shape),
layers.Dense(16, activation = 'relu'),
layers.Dense(8, activation = 'relu'),
layers.Dense(1, activation = 'sigmoid')
])
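# Stop training when the validation loss stops improving for 5 epochs, and roll back to the best weights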
cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]
model.summary()
model.compile(optimizer = optimizers.Adam(0.001), loss = losses.BinaryCrossentropy(), metrics = ['accuracy'])
history = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks = cb)
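# Fine-tuning: continue from the current weights with a 10x lower learning rate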
model.compile(optimizer = optimizers.Adam(0.0001), loss = losses.BinaryCrossentropy(), metrics = ['accuracy'])
history1 = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 256, callbacks = cb)
model.evaluate(Xtest, ytest)
ypred = (model.predict(Xtest)>0.5).astype('int')
cm = confusion_matrix(ytest, ypred)
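# Normalize each true-class row so the cells show per-class fractions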
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
fig = plt.figure(figsize = (4, 4))
ax = fig.add_subplot(111)
for i in range(cm.shape[1]):
    for j in range(cm.shape[0]):
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color="black")
_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(2))
ax.set_yticks(range(2))
ax.set_xticklabels(range(2))
ax.set_yticklabels(range(2))
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Plotting the metrics¶
def plot(history1, history2, variable1, variable2):
    # combining metrics from both trainings
    var1_history = history1[variable1]
    var1_history.extend(history2[variable1])
    var2_history = history1[variable2]
    var2_history.extend(history2[variable2])
    # plotting them
    plt.plot(range(len(var1_history)), var1_history)
    plt.plot(range(len(var2_history)), var2_history)
    plt.legend([variable1, variable2])
    plt.title(variable1)
    plt.show()    # render each metric pair in its own figure
plot(history.history, history1.history, "accuracy", 'val_accuracy')
plot(history.history, history1.history, "loss", 'val_loss')
Prediction¶
# pick a random test data sample
x = random.randint(0, len(Xtest) - 1)
output = model.predict(Xtest[x].reshape(1, -1))[0][0]
pred = (output>0.5).astype('int')
print("Predicted: ", pred, "(", output, "-->", pred, ")")
print("True: ", ytest[x][0])
deepC¶
model.save('financial_distress_prediction.h5')
!deepCC financial_distress_prediction.h5
# pick a random test data sample
x = random.randint(0, len(Xtest) - 1)
np.savetxt('sample.data', Xtest[x]) # xth sample into text file
# run exe with input
!financial_distress_prediction_deepC/financial_distress_prediction.exe sample.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
pred = (nn_out>0.5).astype('int')
print("Predicted: ", pred, "(", nn_out, "-->", pred, ")")
print("True: ", ytest[x][0])