NOTE: This use case is not intended for resource-constrained devices.
Question classification¶
Credit: AITS Cainvas Community
Photo by Mike Mirandi on Dribbble
Identifying the intent of a question, i.e., the type of answer it calls for.
In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, f1_score
from tensorflow.keras import models, layers, optimizers, losses, callbacks
In [2]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Question_Classification_Dataset.csv')
df
Out[2]:
Preprocessing¶
Dropping unwanted columns¶
In [3]:
df = df.drop(columns = ['Unnamed: 0', 'Category1', 'Category2'])
df
Out[3]:
Target labels¶
In [4]:
df['Category0'].value_counts()
Out[4]:
The dataset is not balanced, but we will proceed with it as is.
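Should the imbalance prove problematic later, one common remedy is to weight the loss inversely to class frequency. A minimal sketch using scikit-learn's compute_class_weight; this step is optional and not part of the original pipeline, and the resulting dictionary could later be passed to model.fit through its class_weight argument:

from sklearn.utils.class_weight import compute_class_weight

# Optional: per-class weights inversely proportional to class frequency
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(df['Category0']),
                               y=df['Category0'])
class_weights = dict(enumerate(weights))  # assumes class indices follow np.unique order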
One hot encoding¶
The labels are categorical with no ordinal relationship between them, so they are one-hot encoded.
In [5]:
y = pd.get_dummies(df['Category0'])
In [6]:
class_names = list(y.columns)
class_names
Out[6]:
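Since confusion_matrix and f1_score (imported above) work on integer labels rather than one-hot rows, the encoding can be inverted with argmax when needed. A small illustration, not part of the original notebook:

# Illustration: recover class names from one-hot rows via argmax
label_indices = np.argmax(y.values, axis=1)
print([class_names[i] for i in label_indices[:5]])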
Text cleaning¶
In [7]:
# Remove HTML tags
def removeHTML(sentence):
    regex = re.compile('<.*?>')
    return re.sub(regex, ' ', sentence)

# Remove URLs
def removeURL(sentence):
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)

# Remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile('[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)
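As a quick sanity check, the three helpers can be chained on a made-up question (the sample string below is purely illustrative):

sample = 'What is the <b>tallest</b> mountain? See https://example.com for details!'
cleaned = onlyAlphabets(removeHTML(removeURL(sample))).lower()
print(cleaned)  # tags, the URL and punctuation are all replaced by spaces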
In [8]:
sno = nltk.stem.SnowballStemmer('english')  # Initialize the stemmer

wordcloud = [[] for _ in class_names]  # One list of words per class, used for the word clouds below
all_sentences = []  # All cleaned sentences

for x in range(len(df['Questions'].values)):
    question = df['Questions'].values[x]
    classname = df['Category0'].values[x]

    cleaned_sentence = []

    # Clean the question text
    sentence = removeURL(question)
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()

    for word in sentence.split():
        #if word not in stop:
        stemmed = sno.stem(word)
        cleaned_sentence.append(stemmed)
        wordcloud[class_names.index(classname)].append(word)

    all_sentences.append(' '.join(cleaned_sentence))

# Cleaned sentences are used as the model input
X = all_sentences
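The Tokenizer, pad_sequences and train_test_split imports at the top hint at the next steps: converting the cleaned sentences into padded integer sequences and splitting them into train and test sets. A minimal sketch, assuming a vocabulary cap of 10000 words and a maximum sequence length of 30 (both values are assumptions, not taken from this notebook):

# Sketch: tokenize and pad the cleaned sentences (hyperparameters are assumptions)
tokenizer = Tokenizer(num_words=10000, oov_token='<unk>')
tokenizer.fit_on_texts(X)
X_pad = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=30, padding='post')

Xtrain, Xtest, ytrain, ytest = train_test_split(X_pad, y.values, test_size=0.2, random_state=42)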
Visualization¶
In [9]:
plt.figure(figsize=(40, 40))

for i in range(len(class_names)):
    ax = plt.subplot(len(class_names), 1, i + 1)
    plt.imshow(WordCloud().generate(' '.join(wordcloud[i])))
    plt.title(class_names[i])
    plt.axis("off")