Spam text classification¶
Credit: AITS Cainvas Community
Photo by Emanuele Colombo on Dribbble
Identifying whether a given text is spam or not (ham). This helps filter out unnecessary text content and keeps us focused on the important information.
Importing necessary libraries¶
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import matplotlib.pyplot as plt
from tensorflow.keras import layers, optimizers, losses, callbacks, models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import random
from wordcloud import WordCloud
# stopwords
nltk.download('stopwords')
The dataset¶
Almeida, T.A., Gómez Hidalgo, J.M., Yamakami, A. Contributions to the Study of SMS Spam Filtering: New Collection and Results. Proceedings of the 2011 ACM Symposium on Document Engineering (DOCENG'11), Mountain View, CA, USA, 2011. Website | UCI
The dataset is a CSV file with messages falling into one of two categories - ham and spam.
# Load the SMS spam dataset from the Cainvas S3 bucket.
# Each row has a 'Category' ('spam'/'ham') and a 'Message' text column
# (both columns are used by the preprocessing cells below).
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/SPAM_text_message_20170820_-_Data.csv')
# Display the dataframe (notebook cell output).
df
Preprocessing¶
Dropping repeated rows¶
# Class distribution before removing duplicate rows.
df['Category'].value_counts()

# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates(keep='first')

# Class distribution after deduplication.
df['Category'].value_counts()
It is not a balanced dataset but we will go forward with it.
Encoding the category values¶
# Encode the labels numerically: spam -> 1, ham (anything else) -> 0.
df['Category'] = (df['Category'] == 'spam').astype(int)
df
Data cleaning¶
# Remove html tags
def removeHTML(sentence):
    """Replace every HTML-tag-like span (``<...>``, non-greedy) with a space."""
    return re.sub('<.*?>', ' ', sentence)
# Remove URLs
def removeURL(sentence):
    """Replace every http/https URL in *sentence* with a single space.

    BUG FIX: the pattern is now a raw string so that ``\\S`` is a regex
    metacharacter rather than an invalid Python string escape (which
    raises a SyntaxWarning on Python 3.12+ and is slated to become an
    error).
    """
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)
# remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    """Replace every character that is not an ASCII letter with a space."""
    return re.sub('[^a-zA-Z]', ' ', sentence)
def removeRecurring(sentence):
    """Collapse any character repeated 3 or more times in a row to a single
    occurrence (e.g. 'coool' -> 'col'); double characters are left alone."""
    triple_or_more = re.compile(r'(.)\1{2,}')
    return triple_or_more.sub(r'\1', sentence)
# Defining stopwords and stemmer. NOTE: the cleaning loop below deliberately
# does NOT drop stopwords (the filter was disabled), so short SMS messages
# keep their full context; `stop` is kept in scope for later cells.
stop = nltk.corpus.stopwords.words('english')
sno = nltk.stem.SnowballStemmer('english')  # Initializing stemmer

spam = []           # Stemmed words seen in spam messages (for the word cloud)
ham = []            # Stemmed words seen in ham messages (for the word cloud)
all_sentences = []  # One cleaned, stemmed string per message

# Clean every message: strip URLs and HTML, keep letters only, lowercase,
# collapse 3+ repeated characters, then stem each remaining word.
for review, rating in zip(df['Message'].values, df['Category'].values):
    cleaned_sentence = []
    sentence = removeURL(review)
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()
    sentence = removeRecurring(sentence)
    for word in sentence.split():
        stemmed = sno.stem(word)
        cleaned_sentence.append(stemmed)
        # Collect per-class word lists for the word clouds below.
        if rating == 1:
            spam.append(stemmed)
        else:
            ham.append(stemmed)
    all_sentences.append(' '.join(cleaned_sentence))

# add as column in dataframe
df['Cleaned'] = all_sentences
Visualization¶
# Word clouds of the stemmed vocabulary, one figure per class (spam, then ham).
for word_list in (spam, ham):
    plt.figure(figsize=(20, 20))
    plt.imshow(WordCloud().generate(' '.join(word_list)))
# Splitting into train, val and test set -- 80-10-10 split.
# First carve off 20%, then halve that 20% into validation and test.
train_df, val_test_df = train_test_split(df, test_size=0.2, random_state=113)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=113)

print("Number of samples in...")
for label, subset in (("Training set: ", train_df),
                      ("Validation set: ", val_df),
                      ("Testing set: ", test_df)):
    print(label, len(subset))
Tokenization¶
# Tokenization: unigram bag-of-words capped at 20k features, fitted on the
# training split only, then TF-IDF weighting fitted on the training counts.
cv = CountVectorizer(ngram_range=(1, 1), max_features=20000)
train_bow = cv.fit_transform(train_df['Cleaned'])
val_bow = cv.transform(val_df['Cleaned'])
test_bow = cv.transform(test_df['Cleaned'])

tfidf = TfidfTransformer()
train_tf = tfidf.fit_transform(train_bow)
val_tf = tfidf.transform(val_bow)
test_tf = tfidf.transform(test_bow)
Defining the input and output¶
# Dense feature matrices and label vectors for each split.
Xtrain, ytrain = train_tf.toarray(), train_df['Category']
Xval, yval = val_tf.toarray(), val_df['Category']
Xtest, ytest = test_tf.toarray(), test_df['Category']
The model¶
Here we implement a model based on the frequency of different words in the sentence.
# A small dense network over the TF-IDF features: two hidden ReLU layers
# and a single sigmoid output for binary (spam/ham) classification.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=Xtrain[0].shape))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Stop early when the monitored metric (val_loss by default) stops
# improving for 5 epochs, rolling back to the best weights seen.
cb = [callbacks.EarlyStopping(patience=5, restore_best_weights=True)]

model.summary()

model.compile(optimizer=optimizers.Adam(0.0001),
              loss=losses.BinaryCrossentropy(),
              metrics=['accuracy'])

history = model.fit(Xtrain, ytrain,
                    validation_data=(Xval, yval),
                    epochs=128,
                    callbacks=cb)

model.evaluate(Xtest, ytest)
print("F1 score - ", f1_score(ytest, (model.predict(Xtest)>0.5).astype('int')))

# Map numeric labels back to class names for a readable confusion matrix.
ytest_val = ['spam' if i == 1 else 'ham' for i in ytest]
ypred = (model.predict(Xtest)>0.5).astype('int')
ypred_val = ['spam' if i == 1 else 'ham' for i in ypred]
# BUG FIX: the plot below labels its ticks ['ham', 'spam'], so the matrix
# must be computed in that same class order; the previous
# labels=['spam', 'ham'] swapped the two classes in the rendered figure.
cm = confusion_matrix(ytest_val, ypred_val, labels=['ham', 'spam'])
# Row-normalise so each row (true class) sums to 1.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# Render the (normalised) confusion matrix as an annotated heatmap.
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111)

# Write each cell's value at its grid position (x = column, y = row).
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(col, row, format(cm[row, col], '.2f'),
                horizontalalignment="center", color="black")

columns = ['ham', 'spam']
_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(2))
ax.set_yticks(range(2))
ax.set_xticklabels(columns, rotation=90)
ax.set_yticklabels(columns)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
A significant percentage of the ham messages are classified as spam. This can be improved with a larger dataset that includes more spam samples.
Plotting the metrics¶
def plot(history, variable, variable2):
    """Plot a training metric and its validation counterpart on one figure.

    history: the ``History.history`` dict returned by ``model.fit``.
    variable / variable2: keys into that dict (e.g. 'loss' / 'val_loss').

    FIX: each call now opens a fresh figure and shows it — previously,
    running this as a script drew both metrics onto the same axes — and
    the two curves are labelled so they can be told apart.
    """
    plt.figure()
    epochs = range(len(history[variable]))
    plt.plot(epochs, history[variable], label=variable)
    plt.plot(epochs, history[variable2], label=variable2)
    plt.title(variable)
    plt.xlabel('epoch')
    plt.legend()
    plt.show()

plot(history.history, "accuracy", 'val_accuracy')
plot(history.history, "loss", "val_loss")
Prediction¶
# Pick a random test message, push it through the same cleaning +
# vectorisation pipeline used for training, then predict with the model.
x = np.random.randint(0, Xtest.shape[0] - 1)
sentence = test_df['Message'].values[x]
print("Sentence: ", sentence)

cleaned_sentence = []
for cleaner in (removeURL, removeHTML, onlyAlphabets):
    sentence = cleaner(sentence)
sentence = removeRecurring(sentence.lower())
# Stopwords are intentionally kept, mirroring the training-time cleaning.
for word in sentence.split():
    cleaned_sentence.append(sno.stem(word))

sentence = [' '.join(cleaned_sentence)]
print("\nCleaned sentence: ", sentence[0])

# Bag-of-words counts, then TF-IDF weighting (fitted on the training split).
sentence = tfidf.transform(cv.transform(sentence))

print("\nTrue value: ", columns[test_df['Category'].values[x]])

pred = model.predict(sentence.toarray())[0][0]
print("\nPredicted value: ", columns[int(pred>0.5)], "(", pred, "-->", (pred>0.5).astype('int'), ")")
deepC¶
# Save the trained Keras model to HDF5, then compile it with deepCC for
# microcontroller deployment (the '!' line is a Jupyter shell command,
# not Python).
model.save('spam_text.h5')
!deepCC spam_text.h5
x = np.random.randint(0, Xtest.shape[0] - 1)
sentence = test_df['Message'].values[x]
print("Sentence: ", sentence)
cleaned_sentence = []
sentence = removeURL(sentence)
sentence = removeHTML(sentence)
sentence = onlyAlphabets(sentence)
sentence = sentence.lower()
sentence = removeRecurring(sentence)
for word in sentence.split():
if word not in stop:
stemmed = sno.stem(word)
cleaned_sentence.append(stemmed)
sentence = [' '.join(cleaned_sentence)]
print("\nCleaned sentence: ", sentence[0])
sentence = cv.transform(sentence)
sentence = tfidf.transform(sentence)
print()
np.savetxt('sample.data', sentence.toarray()) # xth sample into text file
# run exe with input
!spam_text_deepC/spam_text.exe sample.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
pred = (nn_out>0.5).astype('int')
print("\nPredicted value: ", columns[int(pred>0.5)], "(", pred, "-->", (pred>0.5).astype('int'), ")")
print("\nTrue value: ", columns[test_df['Category'].values[x]])