NOTE: This Use Case is not purposed for resource constrained devices.
Sarcasm detection in news headlines¶
Credit: AITS Cainvas Community
Photo by Su for RaDesign on Dribbble
Sarcasm has the ability to flip the sentiment of a sentence. This makes sarcasm detection an important part of sentiment analysis.
In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, f1_score
from tensorflow.keras import models, layers, optimizers, losses, callbacks
Dataset¶
This dataset is collected from two news websites. The Onion aims at producing sarcastic versions of current events and the headlines from News in Brief and News in Photos categories (which are sarcastic) were collected. Also, real (and non-sarcastic) news headlines were collected from HuffPost.
In [2]:
# Load the JSON-lines dataset: one record per headline with its sarcasm label.
df = pd.read_json(
    'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Sarcasm_Headlines_Dataset_v2.json',
    lines=True,
)
df
Out[2]:
Distribution of values in classes -
In [3]:
# Class-balance check: counts of sarcastic (1) vs non-sarcastic (0) headlines.
df['is_sarcastic'].value_counts()
Out[3]:
It is an almost balanced dataset.
Data preprocessing¶
In [4]:
def removeHTML(sentence):
    """Replace every HTML tag in *sentence* with a single space."""
    return re.sub('<.*?>', ' ', sentence)
# Remove URLs
def removeURL(sentence):
    """Replace each http/https URL in *sentence* with a single space.

    Fix: the pattern is now a raw string so ``\\S`` is not parsed as a
    (deprecated) string escape — non-raw ``'\\S'`` raises a SyntaxWarning on
    Python 3.12+ and is an error in future versions.
    """
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)
def onlyAlphabets(sentence):
    """Replace every character that is not an ASCII letter with a space."""
    return re.sub('[^a-zA-Z]', ' ', sentence)
In [5]:
sno = nltk.stem.SnowballStemmer('english')  # stemmer, reused later for prediction

# Per-class raw-word pools for the word clouds: index 0 = not sarcastic, 1 = sarcastic.
wordcloud = [[], []]
all_sentences = []  # one cleaned + stemmed string per headline

for headline, sarcasm in zip(df['headline'].values, df['is_sarcastic'].values):
    # Same cleaning pipeline as used at prediction time: strip URLs, HTML,
    # then everything non-alphabetic, and lowercase.
    text = onlyAlphabets(removeHTML(removeURL(headline))).lower()
    stems = []
    for word in text.split():
        stems.append(sno.stem(word))
        wordcloud[sarcasm].append(word)  # word clouds use the unstemmed token
    all_sentences.append(' '.join(stems))

X = all_sentences
y = df['is_sarcastic']
In [6]:
# Human-readable labels indexed by the is_sarcastic flag (0/1).
class_names = ['Not sarcastic', 'Sarcastic']
Visualization¶
In [7]:
# One word cloud per class, stacked vertically in a single figure.
plt.figure(figsize=(10, 10))
for idx, label in enumerate(class_names):
    ax = plt.subplot(len(class_names), 1, idx + 1)
    plt.imshow(WordCloud().generate(' '.join(wordcloud[idx])))
    plt.title(label)
    plt.axis("off")
Train - val split¶
In [8]:
# Hold out 20% of the cleaned sentences as a validation set (80-20 split).
# NOTE(review): no random_state is set, so the split differs between runs.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size = 0.2)
In [9]:
# Tokenization: fit the vocabulary on the training split only, then map both
# splits to fixed-length integer sequences.
vocab = 1500   # keep only the `vocab` most frequent words
mlen = 200     # pad/truncate every sequence to this length

tokenizer = Tokenizer(num_words = vocab, oov_token = '<UNK>')
tokenizer.fit_on_texts(Xtrain)

def _encode(texts):
    # texts -> integer sequences -> left-padded arrays of length `mlen`
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=mlen)

Xtrain = _encode(Xtrain)
Xval = _encode(Xval)
The model¶
In [10]:
# Build the network: Embedding -> LSTM -> small dense head with a sigmoid
# for the binary (sarcastic / not) output.
embedding_dim = 128

model = models.Sequential()
model.add(layers.Embedding(vocab, embedding_dim, input_length = mlen))
model.add(layers.LSTM(128, activation='tanh'))
model.add(layers.Dense(32, activation = 'relu'))
model.add(layers.Dense(16, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))

# Stop once validation loss stalls for 5 epochs; restore the best weights.
cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]
In [11]:
# Print layer-by-layer output shapes and parameter counts.
model.summary()
In [12]:
# Adam at lr 0.01; binary cross-entropy matches the single sigmoid output.
model.compile(
    optimizer = optimizers.Adam(0.01),
    loss = losses.BinaryCrossentropy(),
    metrics = ['accuracy'],
)
# epochs=256 is an upper bound — early stopping (cb) ends training far sooner.
history = model.fit(
    Xtrain, ytrain,
    batch_size=64,
    epochs = 256,
    validation_data=(Xval, yval),
    callbacks = cb,
)
In [13]:
# Final loss and accuracy on the validation split.
model.evaluate(Xval, yval)
Out[13]:
In [14]:
# Row-normalised confusion matrix on the validation split.
cm = confusion_matrix(yval, (model.predict(Xval)>0.5).astype('int64'))
cm = cm.astype('int') / cm.sum(axis=1)[:, np.newaxis]

fig = plt.figure(figsize = (5, 5))
ax = fig.add_subplot(111)
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        # White text on dark (high-value) cells, black elsewhere.
        clr = "white" if cm[row, col] > 0.8 else "black"
        ax.text(col, row, format(cm[row, col], '.2f'),
                horizontalalignment="center", color=clr)
_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(len(class_names)))
ax.set_yticks(range(len(class_names)))
ax.set_xticklabels(class_names, rotation = 90)
ax.set_yticklabels(class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Plotting the metrics¶
In [15]:
def plot(history, variable, variable2):
    """Draw two metric curves from a Keras ``history.history`` dict.

    history: mapping of metric name -> per-epoch values.
    variable, variable2: the two keys to plot (e.g. 'loss', 'val_loss').
    The figure is titled after the first key.
    """
    for key in (variable, variable2):
        plt.plot(range(len(history[key])), history[key])
    plt.legend([variable, variable2])
    plt.title(variable)
In [16]:
# Training vs validation accuracy per epoch.
plot(history.history, "accuracy", 'val_accuracy')
In [17]:
# Training vs validation loss per epoch.
plot(history.history, "loss", 'val_loss')
Prediction¶
In [18]:
# Sanity-check the model on one random headline from the dataset.
# Fix: the original used np.random.randint(0, Xval.shape[0] - 1), which
# (a) needlessly excluded the last index (randint's high bound is already
# exclusive) and (b) bounded the draw by the validation-set size while the
# index is actually used against the full dataframe `df` below.
x = np.random.randint(0, len(df))
headline = df['headline'].values[x]
print("Headline: ", headline)

# Apply the same cleaning pipeline used before training.
cleaned_text = []
sentence = removeURL(headline)
sentence = removeHTML(sentence)
sentence = onlyAlphabets(sentence)
sentence = sentence.lower()
for word in sentence.split():
    cleaned_text.append(sno.stem(word))
cleaned_text = [' '.join(cleaned_text)]
print("Cleaned text: ", cleaned_text[0])

# Tokenize and pad exactly as the training data was.
cleaned_text = tokenizer.texts_to_sequences(cleaned_text)
cleaned_text = pad_sequences(cleaned_text, maxlen=mlen)

category = df['is_sarcastic'].values[x]
print("\nTrue category: ", class_names[category])

# Sigmoid output in [0, 1]; threshold at 0.5 for the class decision.
output = model.predict(cleaned_text)[0][0]
pred = (output>0.5).astype('int64')
print("\nPredicted category: ", class_names[pred], "(", output, "-->", pred, ")")
deepC¶
In [20]:
# Save the trained model in HDF5 format for the deepCC compiler.
model.save('sarcasm.h5')
# IPython shell magic: compile the saved model with deepCC (Cainvas'
# deep-learning compiler) for deployment on edge devices.
!deepCC sarcasm.h5
In [ ]: