# Article category classification

There are too many documents, and it is not obvious what each one is about. Going through documents or files and categorising them by hand is tedious, so we train a model to do it.

import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, multilabel_confusion_matrix, f1_score
import random
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from wordcloud import WordCloud
# stopwords
# Fetch the NLTK stopword corpus (reported as "already up-to-date" when it is
# present locally). nltk.corpus.stopwords.words() raises LookupError without it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jupyter-
[nltk_data]     Rodio346/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

# Dataset

There are two CSV files, train and test, containing article titles, abstracts and the subjects the articles cover.

An article can belong to more than one subject, so this is a multi-label classification problem.

# Load the research-articles dataset: ID, TITLE, ABSTRACT and one 0/1 column per subject
DATA_URL = 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/researchArticles.csv'
df = pd.read_csv(DATA_URL)
df

ID TITLE ABSTRACT Computer Science Physics Mathematics Statistics Quantitative Biology Quantitative Finance
0 1 Reconstructing Subject-Specific Effect Maps Predictive models allow subject-specific inf... 1 0 0 0 0 0
1 2 Rotation Invariance Neural Network Rotation invariance and translation invarian... 1 0 0 0 0 0
2 3 Spherical polyharmonics and Poisson kernels fo... We introduce and develop the notion of spher... 0 0 1 0 0 0
3 4 A finite element approximation for the stochas... The stochastic Landau--Lifshitz--Gilbert (LL... 0 0 1 0 0 0
4 5 Comparative study of Discrete Wavelet Transfor... Fourier-transform infra-red (FTIR) spectra o... 1 0 0 1 0 0
... ... ... ... ... ... ... ... ... ...
20967 20968 Contemporary machine learning: a guide for pra... Machine learning is finding increasingly bro... 1 1 0 0 0 0
20968 20969 Uniform diamond coatings on WC-Co hard alloy c... Polycrystalline diamond coatings have been g... 0 1 0 0 0 0
20969 20970 Analysing Soccer Games with Clustering and Con... We present a new approach for identifying si... 1 0 0 0 0 0
20970 20971 On the Efficient Simulation of the Left-Tail o... The sum of Log-normal variates is encountere... 0 0 1 1 0 0
20971 20972 Why optional stopping is a problem for Bayesians Recently, optional stopping has been a subje... 0 0 1 1 0 0

20972 rows × 9 columns

# Columns in the dataset: three descriptive columns followed by one label column per subject
df.columns

Index(['ID', 'TITLE', 'ABSTRACT', 'Computer Science', 'Physics', 'Mathematics',
'Statistics', 'Quantitative Biology', 'Quantitative Finance'],
dtype='object')
# Subject label columns, in the same order as they appear in the dataset

subjects = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

# Number of articles tagged with each subject (rows whose label column equals 1)

for subject in subjects:
    tagged = (df[subject] == 1).sum()
    print(subject, '-', tagged)
    print()

Computer Science - 8594

Physics - 6013

Mathematics - 5618

Statistics - 5206

Quantitative Biology - 587

Quantitative Finance - 249



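The dataset is imbalanced: Quantitative Biology (587) and Quantitative Finance (249) have far fewer articles than the other four subjects.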

# Data preprocessing

# Remove URLs
# Compiled once at import time. Raw string: '\S' in a plain literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
_URL_PATTERN = re.compile(r'http[s]?://\S+')


def removeURL(sentence):
    """Return `sentence` with every http:// or https:// URL replaced by one space.

    A URL is matched from its scheme up to (not including) the next
    whitespace character. Text without URLs is returned unchanged.
    """
    return _URL_PATTERN.sub(' ', sentence)

# Keep only ASCII letters: digits, punctuation and every other character become a space
def onlyAlphabets(sentence):
    """Replace each character of `sentence` outside a-z / A-Z with a single space."""
    non_letters = '[^a-zA-Z]'
    return re.sub(non_letters, ' ', sentence)

# Defining stopwords
# NLTK's English stopword list; words found in it are dropped during cleaning below.
stop = nltk.corpus.stopwords.words('english')
# Uncomment to stop treating 'not' as a stopword, so it is kept in the cleaned text:
#stop.remove('not')

# Number of stopwords in the list
print(len(stop))

179

# English Snowball stemmer; reduces each word to its stem (e.g. "rotation" -> "rotat")
sno = nltk.stem.SnowballStemmer('english')    # Initializing stemmer

# Per-subject word lists (lower-cased, unstemmed, stopwords removed), one list
# per entry of `subjects` in the same order. Used later for the word clouds.
subject_words = [[] for _ in subjects]

# Cleaned text (stemmed title words followed by stemmed abstract words,
# space-joined) for every row of df, in row order.
all_text = []

# Set membership is O(1); the stopword list would be scanned once per word.
stop_words = set(stop)

# Extract the columns once. Indexing df[subjects].values[x] inside the loop
# rebuilds the whole label array for every row.
titles = df['TITLE'].values
abstracts = df['ABSTRACT'].values
labels = df[subjects].values

for title, abstract, s in zip(titles, abstracts, labels):
    # Indices (into `subjects`) of the subjects this article is tagged with
    s_num = np.where(s == 1)[0]

    cleaned_text = []

    # Title first, then abstract: both go through the same cleaning steps
    for text in (title, abstract):
        text = onlyAlphabets(removeURL(text)).lower()

        for word in text.split():
            if word in stop_words:
                continue

            cleaned_text.append(sno.stem(word))

            # The unstemmed word is recorded so the word clouds show readable words.
            # NOTE(review): indentation was lost in this export; this assumes the
            # append applies only to non-stopwords - confirm against the notebook.
            for si in s_num:
                subject_words[si].append(word)

    all_text.append(' '.join(cleaned_text))

# Pick only the subject label columns. Take an explicit copy so that adding a
# column below modifies an independent frame rather than a slice of the
# original (avoids pandas' SettingWithCopyWarning).
df = df[subjects].copy()

# Add the cleaned text (one string per row, in row order) as a column
df['Cleaned_text'] = all_text

df

Computer Science Physics Mathematics Statistics Quantitative Biology Quantitative Finance Cleaned_text
0 1 0 0 0 0 0 reconstruct subject specif effect map predict ...
1 1 0 0 0 0 0 rotat invari neural network rotat invari trans...
2 0 0 1 0 0 0 spheric polyharmon poisson kernel polyharmon f...
3 0 0 1 0 0 0 finit element approxim stochast maxwel landau ...
4 1 0 0 1 0 0 compar studi discret wavelet transform wavelet...
... ... ... ... ... ... ... ...
20967 1 1 0 0 0 0 contemporari machin learn guid practition phys...
20968 0 1 0 0 0 0 uniform diamond coat wc co hard alloy cut inse...
20969 1 0 0 0 0 0 analys soccer game cluster conceptor present n...
20970 0 0 1 1 0 0 effici simul left tail sum correl log normal v...
20971 0 0 1 1 0 0 option stop problem bayesian recent option sto...

20972 rows × 7 columns

# Save the label columns and the cleaned text to disk, without the index column
df.to_csv('cleaned.csv', index=False)


# Visualization

plt.figure(figsize=(40, 40))

# One word cloud per subject, stacked vertically in a single column
n_rows = len(subjects)
for row, (subject_name, words) in enumerate(zip(subjects, subject_words), start=1):
    ax = plt.subplot(n_rows, 1, row)
    ax.imshow(WordCloud().generate(' '.join(words)))
    ax.set_title(subject_name)
    ax.axis("off")