NOTE: This use case is not intended for resource-constrained devices.
Question classification¶
Credit: AITS Cainvas Community
Photo by Mike Mirandi on Dribbble
The task is to find the intent of a question, i.e., the type of answer expected.
In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, f1_score
from tensorflow.keras import models, layers, optimizers, losses, callbacks
In [2]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Question_Classification_Dataset.csv')
df
Out[2]:
Preprocessing¶
Dropping unwanted columns¶
In [3]:
df = df.drop(columns = ['Unnamed: 0', 'Category1', 'Category2'])
df
Out[3]:
Target labels¶
In [4]:
df['Category0'].value_counts()
Out[4]:
The dataset is not balanced, but we will proceed with it as is.
One hot encoding¶
The labels are categorical with no inherent ordering, so they are one-hot encoded.
In [5]:
y = pd.get_dummies(df['Category0'])
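As a quick illustration (a toy example, not from the dataset), pd.get_dummies turns a label column into one indicator column per class:

toy = pd.Series(['LOCATION', 'HUMAN', 'LOCATION'])
pd.get_dummies(toy)
#    HUMAN  LOCATION
# 0      0         1
# 1      1         0
# 2      0         1
# (newer pandas versions may show the indicators as booleans)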
In [6]:
class_names = list(y.columns)
class_names
Out[6]:
Text cleaning¶
In [7]:
# Remove HTML tags
def removeHTML(sentence):
    regex = re.compile(r'<.*?>')
    return re.sub(regex, ' ', sentence)

# Remove URLs
def removeURL(sentence):
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)

# Remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile(r'[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)
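A quick sanity check of the cleaners on a made-up sentence (illustrative only; the sentence is not from the dataset):

sample = 'Who wrote <b>Hamlet</b>? See https://example.com (c. 1600)'
sample = removeURL(sample)
sample = removeHTML(sample)
sample = onlyAlphabets(sample).lower()
print(sample)    # roughly: 'who wrote hamlet see c' (the extra whitespace is dropped by split() later)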
In [8]:
sno = nltk.stem.SnowballStemmer('english')    # initializing the stemmer

wordcloud = [[] for _ in class_names]    # one word list per class, for the word clouds below
all_sentences = []    # all cleaned sentences

for x in range(len(df['Questions'].values)):
    question = df['Questions'].values[x]
    classname = df['Category0'].values[x]

    cleaned_sentence = []
    sentence = removeURL(question)
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()

    for word in sentence.split():
        #if word not in stop:
        stemmed = sno.stem(word)
        cleaned_sentence.append(stemmed)
        wordcloud[class_names.index(classname)].append(word)

    all_sentences.append(' '.join(cleaned_sentence))

# the cleaned sentences are the model input
X = all_sentences
Visualization¶
In [9]:
plt.figure(figsize=(40, 40))

for i in range(len(class_names)):
    ax = plt.subplot(len(class_names), 1, i + 1)
    plt.imshow(WordCloud().generate(' '.join(wordcloud[i])))
    plt.title(class_names[i])
    plt.axis("off")
Train test split¶
In [10]:
# Splitting into train and val set -- 80-20 split
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size = 0.2)
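Since the classes are imbalanced, an optional variant is to stratify the split so that every class keeps its proportion in both sets (not what is used above; the random_state is an arbitrary choice):

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.2, stratify=df['Category0'], random_state=42)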
Tokenization¶
In [11]:
# Tokenization
vocab = 1500    # vocabulary size (only the most frequent words are kept)
mlen = 200      # maximum sequence length after padding

tokenizer = Tokenizer(num_words=vocab, oov_token='<UNK>')
tokenizer.fit_on_texts(Xtrain)

Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtrain = pad_sequences(Xtrain, maxlen=mlen)

Xval = tokenizer.texts_to_sequences(Xval)
Xval = pad_sequences(Xval, maxlen=mlen)
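To see what the tokenizer produces, one can encode a single cleaned sentence (the sentence here is a hypothetical stemmed example):

sample = ['what is the capit of franc']    # hypothetical cleaned, stemmed sentence
print(tokenizer.texts_to_sequences(sample))    # word indices; words outside the 1500-word vocabulary map to the '<UNK>' index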
The model¶
In [12]:
# Build the neural network
embedding_dim = 128

model = models.Sequential([
    layers.Embedding(vocab, embedding_dim, input_length=mlen),
    layers.LSTM(128, activation='tanh'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(len(class_names), activation='softmax')
])

cb = [callbacks.EarlyStopping(patience=5, restore_best_weights=True)]

model.summary()
In [13]:
model.compile(optimizer = optimizers.Adam(0.01), loss = losses.CategoricalCrossentropy(), metrics = ['accuracy'])
history = model.fit(Xtrain, ytrain, batch_size=64, epochs = 256, validation_data=(Xval, yval), callbacks = cb)
In [14]:
model.evaluate(Xval, yval)
print("F1 score: ", f1_score(np.argmax(yval.to_numpy(), axis = 1), np.argmax(model.predict(Xval), axis = 1), average = 'weighted'))
In [15]:
cm = confusion_matrix(np.argmax(yval.to_numpy(), axis = 1), np.argmax(model.predict(Xval), axis = 1))
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]    # normalize each row to per-class fractions
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot(111)
for i in range(cm.shape[1]):
    for j in range(cm.shape[0]):
        if cm[i, j] > 0.8:
            clr = "white"
        else:
            clr = "black"
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color=clr)
_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(len(class_names)))
ax.set_yticks(range(len(class_names)))
ax.set_xticklabels(class_names, rotation = 90)
ax.set_yticklabels(class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
More samples in the abbreviation category would help achieve better accuracy.
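Alternatively, without collecting more data, the loss can be weighted so that under-represented classes count more during training. A minimal sketch, assuming balanced class weights (this is not part of the training run above):

from sklearn.utils.class_weight import compute_class_weight

labels = np.argmax(ytrain.to_numpy(), axis=1)
weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)

# pass class_weight=dict(enumerate(weights)) to model.fit(...)
# to penalize mistakes on rare classes more heavily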
Plotting the metrics¶
In [16]:
def plot(history, variable, variable2):
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable2])), history[variable2])
    plt.legend([variable, variable2])
    plt.title(variable)
In [17]:
plot(history.history, "accuracy", 'val_accuracy')
In [18]:
plot(history.history, "loss", 'val_loss')
Prediction¶
In [19]:
# pick a random question from the dataset
x = np.random.randint(0, len(df))
question = df['Questions'].values[x]
print("Question: ", question)

# apply the same cleaning pipeline used for training
cleaned_text = []
sentence = removeURL(question)
sentence = removeHTML(sentence)
sentence = onlyAlphabets(sentence)
sentence = sentence.lower()

for word in sentence.split():
    #if word not in stop:
    stemmed = sno.stem(word)
    cleaned_text.append(stemmed)

cleaned_text = [' '.join(cleaned_text)]
print("Cleaned text: ", cleaned_text[0])

cleaned_text = tokenizer.texts_to_sequences(cleaned_text)
cleaned_text = pad_sequences(cleaned_text, maxlen=mlen)
category = df['Category0'].values[x]
print("\nTrue category: ", category)
output = model.predict(cleaned_text)[0]
pred = np.argmax(output)
print("\nPredicted category: ", class_names[pred])
print("Probability: ", output[pred])
deepCC¶
In [20]:
model.save('question.h5')
!deepCC question.h5