Encroachment Detection System Based on Network Anomalies¶
Credit: AITS Cainvas Community
Photo by Evgenia Eiter on Dribbble
- Network Encroachment Detection Systems (NEDS) are set up at a planned point within the network to examine traffic from all devices on it. A NEDS observes the traffic passing on the entire subnet and matches it against a collection of known attack signatures; when a known attack or abnormal behavior is detected, an alert is sent to the administrator. A minimal sketch of this detect-and-alert flow appears below.
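The sketch below is purely illustrative: the signature set, the score_anomaly helper, and the 0.5 threshold are assumptions made for this example, not part of the dataset or the model built later in this notebook.

# Hypothetical sketch of a NEDS detect-and-alert loop (illustrative only)
def inspect_packet(packet, known_signatures, score_anomaly, threshold=0.5):
    # signature check: does the packet match a known attack pattern?
    if packet["signature"] in known_signatures:
        return "ALERT: known attack"
    # anomaly check: does a learned model flag the traffic as abnormal?
    if score_anomaly(packet["features"]) > threshold:
        return "ALERT: abnormal behavior"
    return "ok"

# example usage with dummy values
known_signatures = {"syn-flood", "port-scan"}
packet = {"signature": "syn-flood", "features": [0.1, 0.9]}
print(inspect_packet(packet, known_signatures, lambda f: 0.0))  # -> ALERT: known attack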
Setup: Importing necessary libraries¶
In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import imblearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Importing the Keras libraries and packages
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
Downloading and Unzipping Dataset¶
In [2]:
!wget -N "https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Encroachment.zip"
!unzip -o "Encroachment.zip"
!rm "Encroachment.zip"
Understanding and visualizing the data¶
In [3]:
train = pd.read_csv("Train_data.csv")
test = pd.read_csv("Test_data.csv")
print(train.head(4))
print("Training data has {} rows & {} columns".format(train.shape[0],train.shape[1]))
print(test.head(4))
print("Testing data has {} rows & {} columns".format(test.shape[0],test.shape[1]))
In [4]:
# Exploratory Analysis
# Descriptive statistics (wrapped in print so it displays mid-cell)
print(train.describe())
print(train['num_outbound_cmds'].value_counts())
print(test['num_outbound_cmds'].value_counts())
# 'num_outbound_cmds' holds a single constant value in both sets, so it
# carries no information; drop it from both train & test datasets
train.drop(['num_outbound_cmds'], axis=1, inplace=True)
test.drop(['num_outbound_cmds'], axis=1, inplace=True)
# Attack class distribution
train['class'].value_counts()
In [5]:
train[train['class'] == 'anomaly']
In [6]:
plt.figure(figsize = (6,5))
sns.countplot(x='class', data=train, color="orange")
plt.show()
In [7]:
train.hist(figsize=(30,30))
plt.show()
Data Pre-Processing¶
In [8]:
# Scaling numerical attributes
scaler = StandardScaler()
# extract numerical attributes and scale them to zero mean and unit variance;
# fit the scaler on the training data only, then reuse it on the test data
# so no information leaks from the test set into the preprocessing
cols = train.select_dtypes(include=['float64','int64']).columns
sc_train = scaler.fit_transform(train.select_dtypes(include=['float64','int64']))
sc_test = scaler.transform(test.select_dtypes(include=['float64','int64']))
# turn the results back into dataframes
sc_traindf = pd.DataFrame(sc_train, columns = cols)
sc_testdf = pd.DataFrame(sc_test, columns = cols)
In [9]:
# Encoding categorical attributes
# extract categorical attributes from both training and test sets
cattrain = train.select_dtypes(include=['object']).copy()
cattest = test.select_dtypes(include=['object']).copy()
# fit one LabelEncoder per column on the union of train and test values so
# the same category maps to the same integer in both sets (fitting separate
# encoders on each set can produce inconsistent codes)
for col in cattest.columns:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([cattrain[col], cattest[col]]))
    cattrain[col] = encoder.transform(cattrain[col])
    cattest[col] = encoder.transform(cattest[col])
# the target column 'class' exists only in the training set
cattrain['class'] = LabelEncoder().fit_transform(cattrain['class'])
traincat = cattrain
testcat = cattest
# separate target column from encoded data
enctrain = traincat.drop(['class'], axis=1)
cat_Ytrain = traincat[['class']].copy()
In [10]:
# Union of processed numerical and categorical data
train_x = pd.concat([sc_traindf, enctrain], axis=1)
train_y = cat_Ytrain
print(train_x.shape)
test_df = pd.concat([sc_testdf, testcat], axis=1)
test_df.shape
In [11]:
train_y
Feature Selection¶
In [12]:
# Feature Selection
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
# fit a random forest classifier on the training set
# (ravel the single-column target to a 1-D array)
rfc.fit(train_x, train_y.values.ravel())
# extract feature importances
score = np.round(rfc.feature_importances_, 3)
importances = pd.DataFrame({'feature': train_x.columns, 'importance': score})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
# plot importances
plt.rcParams['figure.figsize'] = (11, 4)
importances.plot.bar()
plt.show()
In [13]:
# Recursive feature elimination
from sklearn.feature_selection import RFE
import itertools
rfc = RandomForestClassifier()
# create the RFE model and select the 15 most relevant attributes
rfe = RFE(rfc, n_features_to_select=15)
rfe = rfe.fit(train_x, train_y.values.ravel())
# summarize the selection of the attributes
feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), train_x.columns)]
selected_features = [v for i, v in feature_map if i]
selected_features
In [14]:
# boolean mask of the features selected by RFE
a = [i[0] for i in feature_map]
# keep only the selected columns in both datasets
train_x = train_x.iloc[:, a]
test_df = test_df.iloc[:, a]
Network Building¶
In [15]:
#Dataset Partition
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(train_x,train_y,train_size=0.70, random_state=2)
# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 8, kernel_initializer = 'uniform', activation = 'relu', input_dim = 15))
# Adding the second hidden layer
classifier.add(Dense(units = 8, kernel_initializer = 'uniform', activation = 'relu'))
# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.summary()
In [16]:
x_val = X_train[-5000:]
y_val = Y_train[-5000:]
X_train = X_train[:-5000]
Y_train = Y_train[:-5000]
In [17]:
x_val
Training¶
In [18]:
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Fitting the ANN to the Training set
history = classifier.fit(X_train, Y_train, epochs = 20, validation_data = (x_val, y_val), verbose=1)
In [19]:
fig, ax1 = plt.subplots(figsize= (10, 5))
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Model accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc = "upper left")
plt.show()
In [20]:
fig, ax1 = plt.subplots(figsize= (10, 5))
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc = "upper left")
plt.show()
In [21]:
Y_train
Evaluation¶
In [22]:
from sklearn.metrics import confusion_matrix
# predict_classes was removed in recent TensorFlow versions;
# threshold the sigmoid output at 0.5 instead
ann_predictions = (classifier.predict(X_test) > 0.5).astype("int32")
cm = confusion_matrix(Y_test, ann_predictions)
sns.heatmap(cm, annot=True, fmt="d", cbar=False)
plt.title("ANN Confusion Matrix")
plt.show()
In [23]:
yhat_train = (classifier.predict(X_train) > 0.5)
yhat_test = (classifier.predict(X_test) > 0.5)
In [25]:
# Validate the model on the held-out split
# (the confusion matrix was already computed and plotted above)
from sklearn import metrics
accuracy = metrics.accuracy_score(Y_test, yhat_test)
classification = metrics.classification_report(Y_test, yhat_test)
print()
print('============================== ANN Model Test Results ==============================')
print()
print("Model Accuracy:" "\n", accuracy)
print()
print("Classification report:" "\n", classification)
print()
In [26]:
# Predicting on the unlabeled test data (test_df)
pred_ann = classifier.predict(test_df)
In [27]:
# threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
# (the original in-place masking left values of exactly 0.5 unchanged)
pred_ann = (pred_ann > 0.5).astype(int)
In [28]:
pred_ann
In [29]:
# LabelEncoder assigned codes alphabetically: 'anomaly' -> 0, 'normal' -> 1
for x in pred_ann[:100]:
    if x == 0:
        print("Anomaly")
    else:
        print("Normal")
In [30]:
classifier.save('EDS.h5')
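As a quick sanity check, the saved model can be reloaded and used for inference. This is an added sketch, not part of the original workflow; it assumes the 'EDS.h5' file written above and reuses test_df from the earlier cells.

# minimal sketch: reload the saved model and predict on a few test rows
from tensorflow.keras.models import load_model

reloaded = load_model('EDS.h5')
sample_preds = (reloaded.predict(test_df[:5]) > 0.5).astype(int)
print(sample_preds)  # 0 = anomaly, 1 = normal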
DeepCC¶
deepCC, Cainvas's deep learning compiler, converts the saved Keras model into optimized code that can be deployed on edge devices and microcontrollers.
In [31]:
!deepCC EDS.h5