NOTE: This use case is not intended for resource-constrained devices.
In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
In [2]:
# importing the dataset
# Fake-news training data fetched from a public S3 bucket (network I/O).
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/dataset.csv')
# Quick look at the columns; 'title' and 'label' are used downstream.
df.head()
Out[2]:
In [3]:
# Drop the Nan Values
# Rows with any missing field are removed; the original integer index is kept
# (it is reset later via X_copy.reset_index()).
df=df.dropna()
In [4]:
# Inspect the class balance; the imbalance motivates the oversampling below.
df.label.value_counts()
Out[4]:
In [5]:
# Class count
# BUG FIX: value_counts() sorts by frequency (descending), so tuple-unpacking
# it assigns the counts to the wrong labels whenever class 1 is the majority.
# Index explicitly by label value instead.
label_counts = df.label.value_counts()
count_class_0 = int(label_counts[0])
count_class_1 = int(label_counts[1])

# Divide by class
df_class_0 = df[df['label'] == 0]
df_class_1 = df[df['label'] == 1]
In [6]:
# Oversample 1-class and concat the DataFrames of both classes.
# random_state pins the resample so Restart-&-Run-All reproduces the same
# balanced frame (the original draw was unseeded and non-reproducible).
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=42)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)
print(df_test_over.label.value_counts())

# Features / target split ('label' is the target column).
X = df_test_over.drop('label', axis='columns')
y = df_test_over['label']
In [7]:
# Sanity check: feature and target row counts must match.
X.shape, y.shape
Out[7]:
In [24]:
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
One-hot Representation
In [9]:
# Vocabulary size
# NOTE(review): 30 is an unusually small hashing space for one_hot(); a real
# vocabulary will collide heavily. Left as-is to preserve results — consider a
# much larger value (e.g. 5000) and retraining.
voc_size=30
In [10]:
# Work on a copy so the original feature frame X is left untouched.
X_copy=X.copy()
In [11]:
# Reset to a 0..n-1 index so the positional loop below can use X_copy['title'][i].
# Assignment form instead of inplace=True — same end state, but idiomatic and
# chain-friendly (inplace returns None and hides the mutation).
X_copy = X_copy.reset_index()
In [12]:
import nltk
import re
from nltk.corpus import stopwords
In [13]:
# One-time download of the NLTK stopword corpus (cached under ~/nltk_data).
nltk.download('stopwords')
Out[13]:
In [14]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
# PERF FIX: fetch the stopword list once as a set. The original called
# stopwords.words('english') for every word of every title, re-reading the
# corpus file and scanning a list each time.
stop_words = set(stopwords.words('english'))

# Normalise each title: strip non-letters, lowercase, drop stopwords, stem.
corpus = []
for i in range(len(X_copy)):
    review = re.sub('[^a-zA-Z]', ' ', X_copy['title'][i])
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))
In [15]:
# Spot-check the first 20 cleaned, stemmed titles.
corpus[:20]
Out[15]:
In [16]:
# Hash-encode every word of every title into [1, voc_size).
# NOTE(review): with voc_size=30 the hash will collide across most words —
# verify this is acceptable before trusting downstream accuracy.
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr[:20]
Out[16]:
Embedding Representation¶
In [17]:
# Pad/truncate every sequence to a fixed length of 20 tokens, padding on the
# left ('pre') so the most recent tokens sit nearest the LSTM's final step.
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)
In [18]:
embedded_docs[0]
Out[18]:
Building the model¶
In [19]:
# Bidirectional-LSTM binary classifier over the padded one-hot sequences.
embedding_vector_features = 40  # embedding dimension per token

model1 = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.7),                    # aggressive dropout to curb overfitting
    Bidirectional(LSTM(100)),
    Dropout(0.7),
    Dense(1, activation='sigmoid'),  # probability of class 1
])
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()
In [20]:
# Materialise the model inputs/targets as plain numpy arrays.
X_final=np.array(embedded_docs)
y_final=np.array(y)
In [21]:
# First-axis lengths must agree before splitting.
X_final.shape,y_final.shape
Out[21]:
In [22]:
from sklearn.model_selection import train_test_split

# Stratify on y_final — the array actually being split — rather than the pandas
# Series y (identical values, but consistent with the inputs); random_state
# pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.33, random_state=42, stratify=y_final
)
Training Model¶
In [25]:
# Checkpoint the lowest-val_loss weights to disk and stop once val_loss has
# not improved for 5 consecutive epochs, restoring the best weights afterwards.
model_name = "fake_news.h5"
cb = [
    callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    callbacks.ModelCheckpoint(model_name, monitor='val_loss', save_best_only=True),
]
history = model1.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50, batch_size=64, callbacks=cb,
)
In [27]:
from matplotlib import pyplot as plt

# Training vs validation accuracy per epoch (explicit fig/ax interface).
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()
In [26]:
from matplotlib import pyplot as plt

# Training vs validation loss per epoch (explicit fig/ax interface).
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()
Predictions¶
In [28]:
# BUG FIX: Sequential.predict_classes() was removed in TensorFlow 2.6.
# For a sigmoid output, thresholding predict() at 0.5 is the equivalent
# (keeps the same (n, 1) int array shape the old API returned).
y_pred1 = (model1.predict(X_test) > 0.5).astype("int32")
In [29]:
from sklearn.metrics import confusion_matrix
In [30]:
# Rows = actual labels, columns = predicted labels.
confusion_matrix(y_test,y_pred1)
Out[30]:
In [31]:
import seaborn as sn
from matplotlib import pyplot as plt
# Render the confusion matrix as an annotated integer heatmap.
cm = tf.math.confusion_matrix(labels = y_test, predictions = y_pred1)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel("predicted")
plt.ylabel("actual")
Out[31]:
Accuracy of the Model¶
In [32]:
from sklearn.metrics import accuracy_score
# Overall accuracy on the held-out (oversampled) split.
accuracy_score(y_test,y_pred1)
Out[32]:
In [33]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test,y_pred1))
In [34]:
# importing the test data
In [35]:
# Load the unlabeled test set from the public S3 bucket (network I/O).
df_test = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/test.csv')
In [36]:
# Preview the first five test rows.
df_test[:5]
Out[36]:
In [37]:
# Ensure a clean 0..n-1 index for the positional lookups below.
df_test = df_test.reset_index(drop=True)
In [38]:
# BUG FIX: the original assigned the *training* sequences (embedded_docs) here,
# so the "test" predictions below never looked at test.csv at all. Run the test
# titles through the same clean → one-hot → pad pipeline as the training data.
stop_words_test = set(stopwords.words('english'))
test_corpus = []
for i in range(len(df_test)):
    # str() guards against NaN titles — no dropna was applied to df_test.
    review = re.sub('[^a-zA-Z]', ' ', str(df_test['title'][i]))
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stop_words_test]
    test_corpus.append(' '.join(review))

test_onehot = [one_hot(words, voc_size) for words in test_corpus]
df_test1 = np.array(pad_sequences(test_onehot, padding='pre', maxlen=sent_length))
Making Predictions for the Test Data
In [39]:
# BUG FIX: predict_classes() was removed in TensorFlow 2.6 — threshold the
# sigmoid probabilities at 0.5 instead (same (n, 1) int array as before).
y_pred2 = (model1.predict(df_test1) > 0.5).astype("int32")
In [40]:
# Wrap predictions in a frame; fix the 'lables' typo so the output column is
# named 'label', matching the training data's column name.
y_pred2 = pd.DataFrame(y_pred2, columns=['label'])
In [41]:
# Pair each test id with its predicted label (aligned by row position).
df_final_0 = pd.concat([df_test['id'], y_pred2], axis = 1)
In [42]:
# Write the predictions file without the meaningless RangeIndex column
# (the original emitted an extra unnamed index column).
# NOTE(review): consider naming this 'Predictions.csv' so the format is obvious.
df_final_0.to_csv('Predictions', index=False)
In [43]:
# Preview the final id/label frame.
df_final_0.head(10)
Out[43]: