NOTE: This use case is not intended for resource-constrained devices.
Article category classification¶
Credit: AITS Cainvas Community
Photo by Mogilev Konstantin on Dribbble
What is this article talking about?¶
Too many documents, but what are they about? Going through documents or files one by one and categorising them by hand is a tiring task. This notebook trains a neural network that assigns subject labels to research articles based on their titles and abstracts.
In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, multilabel_confusion_matrix, f1_score
import random
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from wordcloud import WordCloud
# stopwords
nltk.download('stopwords')
Out[1]:
Dataset¶
There are two CSV files, train and test with article titles, abstracts and the subjects they talk about.
The articles can belong to more than 1 subject.
In [2]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/researchArticles.csv')
df
Out[2]:
In [3]:
# Columns in the dataset
df.columns
Out[3]:
In [4]:
# Defining the list of subjects
subjects = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
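To make the multi-label structure concrete, here is a quick sanity check (a minimal sketch; the exact values depend on which row is inspected) showing how one article's subject columns read as a multi-hot vector:
row = df[subjects].iloc[0].values
print(row)   # e.g. [1 0 0 0 0 0] — a 1 marks every subject the article belongs to, and several can be 1 at once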
In [5]:
# Distribution of subject values
for subject in subjects:
    print(subject, '-', list(df[subject]).count(1))
    print()
The dataset is imbalanced: Quantitative Biology and Quantitative Finance have far fewer positive samples than the other subjects.
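One way to quantify the imbalance (a small sketch, not part of the original notebook) is to print the fraction of articles tagged with each subject:
for subject in subjects:
    print(f"{subject}: {df[subject].mean():.2%} of articles")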
Data preprocessing¶
In [6]:
# Remove URLs
def removeURL(sentence):
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)
In [7]:
# Remove numbers, punctuation and any special characters (keep only letters)
def onlyAlphabets(sentence):
    regex = re.compile('[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)
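A quick check of the two helpers on a made-up sentence (the URL and text are purely illustrative):
sample = "Deep learning at https://example.com beats 99% of baselines!"
print(onlyAlphabets(removeURL(sample)).lower())
# roughly: 'deep learning at   beats     of baselines '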
In [8]:
# Defining stopwords
stop = nltk.corpus.stopwords.words('english')
#stop.remove('not')
print(len(stop))
In [9]:
sno = nltk.stem.SnowballStemmer('english')   # Initializing stemmer

subject_words = [[], [], [], [], [], []]     # words collected per subject (used for the word clouds)
all_text = []                                # cleaned text of every article

for x in range(len(df)):
    title = df['TITLE'].values[x]
    abstract = df['ABSTRACT'].values[x]
    s = df[subjects].values[x]
    s_num = np.where(s == 1)[0]              # indices of the subjects this article belongs to

    cleaned_text = []

    title = removeURL(title)
    title = onlyAlphabets(title)
    title = title.lower()

    abstract = removeURL(abstract)
    abstract = onlyAlphabets(abstract)
    abstract = abstract.lower()

    for word in title.split():
        if word not in stop:
            stemmed = sno.stem(word)
            cleaned_text.append(stemmed)
            for si in s_num:
                subject_words[si].append(word)

    for word in abstract.split():
        if word not in stop:
            stemmed = sno.stem(word)
            cleaned_text.append(stemmed)
            for si in s_num:
                subject_words[si].append(word)

    all_text.append(' '.join(cleaned_text))

# Pick only the required (label) columns
df = df[subjects].copy()

# Add the cleaned text as a column in the dataframe
df['Cleaned_text'] = all_text
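To see what the Snowball stemmer used above does to individual words, a small illustration (the outputs shown are typical stems):
print(sno.stem('classification'), sno.stem('networks'), sno.stem('learning'))
# -> classif network learn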
In [10]:
df
Out[10]:
In [11]:
df.to_csv('cleaned.csv', index=False)
Visualization¶
In [12]:
plt.figure(figsize=(40,40))
for i in range(len(subjects)):
    ax = plt.subplot(len(subjects), 1, i + 1)
    plt.imshow(WordCloud().generate(' '.join(subject_words[i])))
    plt.title(subjects[i])
    plt.axis("off")
Data preprocessing continued...¶
In [13]:
df = pd.read_csv('cleaned.csv')
df
Out[13]:
In [14]:
# check for any null values
df.count()
Out[14]:
In [15]:
df = df.dropna()
df.count()
Out[15]:
In [16]:
# Defining the output (label) columns
y = np.array(df[subjects])
In [17]:
input_col = "Cleaned_text"
X = df[input_col]
X
Out[17]:
In [18]:
split = int(0.8*len(df))
Xtrain, Xtest = X[:split], X[split:]
ytrain, ytest = y[:split], y[split:]
print("Train set - ", Xtrain.shape[0])
print("Test set - ", Xtest.shape[0])
In [19]:
# Tokenization
vocab = 15000
tokenizer = Tokenizer(num_words = vocab, oov_token = '<UNK>')
tokenizer.fit_on_texts(Xtrain)
word_index = tokenizer.word_index
# Padding
mlen = 600
padding_type = 'post'
trunc_type = 'post'
Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtrain = pad_sequences(Xtrain, maxlen=mlen, padding=padding_type, truncating=trunc_type)
Xtest = tokenizer.texts_to_sequences(Xtest)
Xtest = pad_sequences(Xtest, maxlen=mlen, padding=padding_type, truncating=trunc_type)
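To make the tokenization and padding step concrete, a minimal sketch on a made-up cleaned string (the integer indices depend on the fitted vocabulary, so the exact output will differ):
toy = ["neural network classif text"]                  # hypothetical stemmed input
seq = tokenizer.texts_to_sequences(toy)                # words -> integer indices; unseen words map to '<UNK>'
padded = pad_sequences(seq, maxlen=mlen, padding=padding_type, truncating=trunc_type)
print(padded.shape)                                    # (1, 600): every sample is padded/truncated to the same length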
The model¶
In [20]:
# Build and train neural network
embedding_dim = 32

model = Sequential([
    Embedding(vocab, embedding_dim, input_length=mlen),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dense(len(subjects), activation='sigmoid')
])

cb = [ModelCheckpoint('articles.h5', monitor='val_accuracy', save_best_only=True)]
model.summary()
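Since an article can belong to several subjects at once, the output layer uses one sigmoid per subject (independent probabilities) rather than a softmax, and binary cross-entropy is the matching loss. A sketch of how a raw prediction is turned into labels (the probabilities below are made up):
probs = np.array([0.91, 0.08, 0.40, 0.72, 0.03, 0.02])   # hypothetical sigmoid outputs for one article
labels = (probs > 0.5).astype(int)                        # -> [1 0 0 1 0 0]
print([subjects[i] for i in np.where(labels == 1)[0]])    # ['Computer Science', 'Statistics']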
In [21]:
# 'accuracy' (lowercase) so the history keys match the 'val_accuracy' monitored by the checkpoint and plotted below
model.compile(optimizer=Adam(0.1), loss=BinaryCrossentropy(), metrics=['accuracy'])
history = model.fit(Xtrain, ytrain, batch_size=64, epochs = 256, validation_data=(Xtest, ytest), callbacks = cb)
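EarlyStopping was imported at the top but not used; if desired, it could be added alongside the checkpoint (a sketch with an assumed patience value):
cb = [
    ModelCheckpoint('articles.h5', monitor='val_accuracy', save_best_only=True),
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)   # assumed patience; stop once val_loss plateaus
]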
In [22]:
model.load_weights('articles.h5')
In [23]:
ypred = model.predict(Xtest)
ypred = (ypred>0.5).astype('int')
acc = accuracy_score(ytest, ypred)
f1 = f1_score(ytest,ypred, average='samples')
print("Accuracy = ", acc)
print("F1 score = ", f1)
In [24]:
cm = multilabel_confusion_matrix(ytest, ypred)
plt.figure(figsize=(40,40))
for k in range(cm.shape[0]):
    cmi = cm[k].astype('float') / cm[k].sum(axis=1)[:, np.newaxis]
    ax = plt.subplot(len(subjects), 1, k + 1)
    for i in range(cmi.shape[1]):
        for j in range(cmi.shape[0]):
            plt.text(j, i, format(cmi[i, j], '.2f'), horizontalalignment="center", color="black")
    plt.title(subjects[k])
    plt.imshow(cmi, cmap=plt.cm.Blues)
The low level of true positives for Quantitative Biology and Quantitative Finance is due to the low number of training samples for those subjects.
Plotting the metrics¶
In [25]:
def plot(history, variable, variable2):
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable2])), history[variable2])
    plt.legend([variable, variable2])
    plt.title(variable)
In [26]:
plot(history.history, "accuracy", 'val_accuracy')
In [27]:
plot(history.history, "loss", 'val_loss')
Prediction¶
In [28]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/researchArticles.csv')
In [29]:
# Pick a random sample from the test portion of the data (offset by the train/test split point)
x = np.random.randint(0, Xtest.shape[0] - 1) + split

title = df['TITLE'].values[x]
abstract = df['ABSTRACT'].values[x]

print("Title: ", title)
print("\nAbstract: ", abstract)

cleaned_text = []

title = removeURL(title)
title = onlyAlphabets(title)
title = title.lower()

abstract = removeURL(abstract)
abstract = onlyAlphabets(abstract)
abstract = abstract.lower()

for word in title.split():
    if word not in stop:
        stemmed = sno.stem(word)
        cleaned_text.append(stemmed)

for word in abstract.split():
    if word not in stop:
        stemmed = sno.stem(word)
        cleaned_text.append(stemmed)

cleaned_text = [' '.join(cleaned_text)]
print("Cleaned text: ", cleaned_text[0])

cleaned_text = tokenizer.texts_to_sequences(cleaned_text)
cleaned_text = pad_sequences(cleaned_text, maxlen=mlen, padding=padding_type, truncating=trunc_type)

s = df[subjects].values[x]
s_num = np.where(s == 1)[0]

print("\nTrue subjects: ")
for si in s_num:
    print(subjects[si])

pred = model.predict(cleaned_text)[0]
predn = (pred > 0.5).astype('int')
s_num = np.where(predn == 1)[0]

print("\nPredicted subjects: ")
for si in s_num:
    print(subjects[si], '(', pred[si], ')')
deepC¶
In [31]:
#!deepCC articles.h5