Article category classification¶
Credit: AITS Cainvas Community
Photo by Mogilev Konstantin on Dribbble
What is this article talking about?¶
Too many documents, but what are they about? Going through documents or files one by one and categorising them by hand is a tiring task. In this notebook, we train a model that tags research articles with the subjects they cover, using their titles and abstracts.
In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, multilabel_confusion_matrix, f1_score
import random
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from wordcloud import WordCloud
# stopwords
nltk.download('stopwords')
Out[1]:
Dataset¶
There are two CSV files, train and test, containing article titles, abstracts and the subjects they cover.
An article can belong to more than one subject (a quick check after the subject list below confirms this).
In [2]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/researchArticles.csv')
df
Out[2]:
In [3]:
# Columns in the dataset
df.columns
Out[3]:
In [4]:
# Defining the list of subjects
subjects = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
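With the dataframe loaded and the subject columns named, a quick check confirms the multi-label nature of the data. This is a small sketch added for illustration, not one of the original notebook cells:

# Sketch: count articles tagged with more than one subject
multi_label_count = (df[subjects].sum(axis=1) > 1).sum()
print(multi_label_count, 'articles belong to more than one subject')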
In [5]:
# Distribution of subject values
for subject in subjects:
    print(subject, '-', list(df[subject]).count(1))
    print()
It is an unbalanced dataset.
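One way to compensate for this imbalance later, during training, is to weight each subject inversely to how often it appears. The snippet below is only a sketch of that idea and is not part of the original pipeline; the weights could feed a weighted loss if needed:

# Sketch (assumption): inverse-frequency weights per subject, heavier for rarer subjects
positive_counts = df[subjects].sum()
subject_weights = {subject: len(df) / (len(subjects) * count)
                   for subject, count in positive_counts.items()}
print(subject_weights)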
Data preprocessing¶
In [6]:
# Remove URLs
def removeURL(sentence):
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)
In [7]:
# remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile('[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)
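As a quick sanity check, the two helpers can be chained on a made-up sentence (the string below is illustrative and not taken from the dataset):

sample = 'See https://example.com for the 2nd-order PDE solver (v1.0)!'
print(onlyAlphabets(removeURL(sample)).lower())
# the URL is dropped and digits/punctuation become spaces;
# the extra spaces disappear later when the text is split into words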
In [8]:
# Defining stopwords
stop = nltk.corpus.stopwords.words('english')
#stop.remove('not')
print(len(stop))
In [9]:
sno = nltk.stem.SnowballStemmer('english')    # Initializing stemmer

subject_words = [[], [], [], [], [], []]    # words seen under each subject (for the word clouds)
all_text = []    # cleaned title + abstract of each article

#print(len(df))
for x in range(len(df)):
    #print(x)
    title = df['TITLE'].values[x]
    abstract = df['ABSTRACT'].values[x]

    s = df[subjects].values[x]
    s_num = np.where(s == 1)[0]    # indices of the subjects this article belongs to

    cleaned_text = []

    title = removeURL(title)
    title = onlyAlphabets(title)
    title = title.lower()

    abstract = removeURL(abstract)
    abstract = onlyAlphabets(abstract)
    abstract = abstract.lower()

    for word in title.split():
        if word not in stop:
            stemmed = sno.stem(word)
            cleaned_text.append(stemmed)
            for si in s_num:
                subject_words[si].append(word)

    for word in abstract.split():
        if word not in stop:
            stemmed = sno.stem(word)
            cleaned_text.append(stemmed)
            for si in s_num:
                subject_words[si].append(word)

    all_text.append(' '.join(cleaned_text))

# pick only the required (subject) columns; .copy() avoids a SettingWithCopyWarning below
df = df[subjects].copy()

# add the cleaned text as a column in the dataframe
df['Cleaned_text'] = all_text
In [10]:
df
Out[10]:
In [11]:
df.to_csv('cleaned.csv', index=False)
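Since the imports above include pad_sequences, the cleaned documents will eventually be padded to a fixed length; a quick look at their word counts helps pick that length. This is a small sketch added for illustration:

# Sketch: distribution of document lengths (in words) after cleaning
lengths = df['Cleaned_text'].str.split().str.len()
print(lengths.describe())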
Visualization¶
In [12]:
plt.figure(figsize=(40, 40))

for i in range(len(subjects)):
    ax = plt.subplot(len(subjects), 1, i + 1)
    plt.imshow(WordCloud().generate(' '.join(subject_words[i])))
    plt.title(subjects[i])
    plt.axis("off")