
Question classification

Credit: AITS Cainvas Community

Photo by Mike Mirandi on Dribbble

Identifying the intent of a question, i.e., the type of answer it expects.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, f1_score
from tensorflow.keras import models, layers, optimizers, losses, callbacks

The dataset

On Kaggle by ARES

The dataset is a CSV file with questions and their corresponding categories and sub-categories.

In [2]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Question_Classification_Dataset.csv')
df
Out[2]:
Unnamed: 0 Questions Category0 Category1 Category2
0 0 How did serfdom develop in and then leave Russ... DESCRIPTION DESC manner
1 1 What films featured the character Popeye Doyle ? ENTITY ENTY cremat
2 2 How can I find a list of celebrities ' real na... DESCRIPTION DESC manner
3 3 What fowl grabs the spotlight after the Chines... ENTITY ENTY animal
4 4 What is the full form of .com ? ABBREVIATION ABBR exp
... ... ... ... ... ...
5447 5447 What 's the shape of a camel 's spine ? ENTITY ENTY other
5448 5448 What type of currency is used in China ? ENTITY ENTY currency
5449 5449 What is the temperature today ? NUMERIC NUM temp
5450 5450 What is the temperature for cooking ? NUMERIC NUM temp
5451 5451 What currency is used in Australia ? ENTITY ENTY currency

5452 rows × 5 columns

Preprocessing

Dropping unwanted columns

In [3]:
df = df.drop(columns = ['Unnamed: 0', 'Category1', 'Category2'])
df
Out[3]:
Questions Category0
0 How did serfdom develop in and then leave Russ... DESCRIPTION
1 What films featured the character Popeye Doyle ? ENTITY
2 How can I find a list of celebrities ' real na... DESCRIPTION
3 What fowl grabs the spotlight after the Chines... ENTITY
4 What is the full form of .com ? ABBREVIATION
... ... ...
5447 What 's the shape of a camel 's spine ? ENTITY
5448 What type of currency is used in China ? ENTITY
5449 What is the temperature today ? NUMERIC
5450 What is the temperature for cooking ? NUMERIC
5451 What currency is used in Australia ? ENTITY

5452 rows × 2 columns

Target labels

In [4]:
df['Category0'].value_counts()
Out[4]:
ENTITY          1250
HUMAN           1223
DESCRIPTION     1162
NUMERIC          896
LOCATION         835
ABBREVIATION      86
Name: Category0, dtype: int64

The dataset is imbalanced (ABBREVIATION has far fewer samples than the other classes), but we will proceed with it as is.
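If the imbalance were to be addressed, one lightweight option is class weighting. A minimal sketch, not used in this notebook, assuming the alphabetical class order that the one-hot encoding below produces:

# Hypothetical mitigation (not applied here): weight classes inversely
# to their frequency and pass the dict to model.fit later.
counts = df['Category0'].value_counts()
class_weight = {i: len(df) / (len(counts) * counts[name])
                for i, name in enumerate(sorted(counts.index))}
# model.fit(..., class_weight=class_weight)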

One hot encoding

The labels are categorical with no ordinal relationship between them, so they are one-hot encoded.

In [5]:
y = pd.get_dummies(df['Category0'])
In [6]:
class_names = list(y.columns)

class_names
Out[6]:
['ABBREVIATION', 'DESCRIPTION', 'ENTITY', 'HUMAN', 'LOCATION', 'NUMERIC']
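Each label becomes a 6-wide indicator row, with columns in the alphabetical order above. A quick check on row 1, which is an ENTITY question:

print(y.shape)             # (5452, 6)
print(y.iloc[1].tolist())  # [0, 0, 1, 0, 0, 0] -- only the ENTITY column is set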

Text cleaning

In [7]:
# Remove HTML tags
def removeHTML(sentence):
    regex = re.compile(r'<.*?>')
    return re.sub(regex, ' ', sentence)

# Remove URLs
def removeURL(sentence):
    regex = re.compile(r'http[s]?://\S+')
    return re.sub(regex, ' ', sentence)

# Remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile(r'[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)
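A quick check of the three cleaners chained together, on a made-up string:

sample = 'Visit <b>https://example.com</b> in 2021!'
print(onlyAlphabets(removeHTML(removeURL(sample))).lower())
# -> 'visit    in      ' -- the URL, tag, digits and punctuation all become spaces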
In [8]:
sno = nltk.stem.SnowballStemmer('english')    # Initializing stemmer
wordcloud = [[] for _ in range(len(class_names))]    # one word list per class
all_sentences = []    # All cleaned sentences


for x in range(len(df['Questions'].values)):
    question = df['Questions'].values[x]
    classname = df['Category0'].values[x]

    cleaned_sentence = []
    sentence = removeURL(question)
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()

    for word in sentence.split():
        # stop-word removal is skipped here; question words like 'what'/'who' carry intent
        stemmed = sno.stem(word)
        cleaned_sentence.append(stemmed)

        wordcloud[class_names.index(classname)].append(word)

    all_sentences.append(' '.join(cleaned_sentence))

# The cleaned sentences are the model input
X = all_sentences
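The stemmer produces truncated roots rather than dictionary words, as the Prediction section later shows; for example:

print(sno.stem('liberty'), sno.stem('statue'), sno.stem('does'))
# -> liberti statu doe (matching the cleaned text in the Prediction section)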

Visualization

In [9]:
plt.figure(figsize=(40,40))

for i in range(len(class_names)):
    ax = plt.subplot(len(class_names), 1, i + 1)
    plt.imshow(WordCloud().generate(' '.join(wordcloud[i])))
    plt.title(class_names[i])
    plt.axis("off")

Train-validation split

In [10]:
# Splitting into train and validation sets -- 80-20 split
# (no random_state is set, so the exact split varies between runs)

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size = 0.2)

Tokenization

In [11]:
# Tokenization
vocab = 1500    # vocabulary size (most frequent words kept)
mlen = 200      # maximum sequence length after padding

tokenizer = Tokenizer(num_words = vocab, oov_token = '<UNK>')
tokenizer.fit_on_texts(Xtrain)    # fit on the training set only

Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtrain = pad_sequences(Xtrain, maxlen=mlen)

Xval = tokenizer.texts_to_sequences(Xval)
Xval = pad_sequences(Xval, maxlen=mlen)
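As a quick sanity check (illustrative; the exact ids depend on the fitted vocabulary), a cleaned sentence maps to a list of word ids, with out-of-vocabulary words mapped to the <UNK> id (1), and is then left-padded with zeros to length 200:

sample = ['what currenc is use in china']
seq = tokenizer.texts_to_sequences(sample)   # one list of word ids per sentence
print(pad_sequences(seq, maxlen=mlen).shape) # (1, 200)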

The model

In [12]:
# Build and train neural network
embedding_dim = 128
 
model = models.Sequential([
    layers.Embedding(vocab, embedding_dim, input_length = mlen),
    layers.LSTM(128, activation='tanh'),
    layers.Dense(64, activation = 'relu'),
    layers.Dense(32, activation = 'relu'),
    layers.Dense(len(class_names), activation = 'softmax')
])
 
cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]    # stop after 5 epochs without val_loss improvement and roll back to the best weights
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 200, 128)          192000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 198       
=================================================================
Total params: 334,118
Trainable params: 334,118
Non-trainable params: 0
_________________________________________________________________
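The parameter counts in the summary can be verified from the layer shapes; a quick check:

# Embedding: one 128-dim vector per vocabulary entry
print(1500 * 128)                        # 192000
# LSTM: 4 gates, each with input, recurrent and bias weights
print(4 * (128 * (128 + 128) + 128))     # 131584
# Dense layers: weights + biases
print(128*64 + 64, 64*32 + 32, 32*6 + 6) # 8256 2080 198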
In [13]:
model.compile(optimizer = optimizers.Adam(0.01), loss = losses.CategoricalCrossentropy(), metrics = ['accuracy'])
 
history = model.fit(Xtrain, ytrain, batch_size=64, epochs = 256, validation_data=(Xval, yval), callbacks = cb)
Epoch 1/256
69/69 [==============================] - 1s 22ms/step - loss: 1.1873 - accuracy: 0.5175 - val_loss: 0.6845 - val_accuracy: 0.7699
Epoch 2/256
69/69 [==============================] - 1s 15ms/step - loss: 0.4860 - accuracy: 0.8441 - val_loss: 0.5270 - val_accuracy: 0.8231
Epoch 3/256
69/69 [==============================] - 1s 15ms/step - loss: 0.2873 - accuracy: 0.9080 - val_loss: 0.5683 - val_accuracy: 0.8323
Epoch 4/256
69/69 [==============================] - 1s 15ms/step - loss: 0.1973 - accuracy: 0.9383 - val_loss: 0.5296 - val_accuracy: 0.8341
Epoch 5/256
69/69 [==============================] - 1s 15ms/step - loss: 0.1534 - accuracy: 0.9507 - val_loss: 0.6024 - val_accuracy: 0.8304
Epoch 6/256
69/69 [==============================] - 1s 15ms/step - loss: 0.1228 - accuracy: 0.9619 - val_loss: 0.5924 - val_accuracy: 0.8433
Epoch 7/256
69/69 [==============================] - 1s 15ms/step - loss: 0.1094 - accuracy: 0.9612 - val_loss: 0.5710 - val_accuracy: 0.8433
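Training stopped at epoch 7: the best val_loss (0.5270) was reached at epoch 2, none of the following five epochs improved on it, so early stopping kicked in and restore_best_weights rolled the model back to the epoch 2 weights.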
In [14]:
model.evaluate(Xval, yval)

print("F1 score: ", f1_score(np.argmax(yval.to_numpy(), axis = 1), np.argmax(model.predict(Xval), axis = 1), average = 'weighted'))
35/35 [==============================] - 0s 4ms/step - loss: 0.5270 - accuracy: 0.8231
F1 score:  0.8226130708772106
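For a per-class breakdown alongside the weighted F1, sklearn's classification_report could be used; a minimal sketch:

from sklearn.metrics import classification_report
ytrue = np.argmax(yval.to_numpy(), axis = 1)
ypred = np.argmax(model.predict(Xval), axis = 1)
print(classification_report(ytrue, ypred, target_names = class_names))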
In [15]:
cm = confusion_matrix(np.argmax(yval.to_numpy(), axis = 1), np.argmax(model.predict(Xval), axis = 1))
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]    # normalize each row (true class) to proportions

fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot(111)

for i in range(cm.shape[1]):
    for j in range(cm.shape[0]):
        if cm[i,j] > 0.8:
            clr = "white"
        else:
            clr = "black"
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color=clr)

_ = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_xticks(range(len(class_names)))
ax.set_yticks(range(len(class_names)))
ax.set_xticklabels(class_names, rotation = 90)
ax.set_yticklabels(class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

More samples in the ABBREVIATION category would help achieve better accuracy.

Plotting the metrics

In [16]:
def plot(history, variable, variable2):
    # Plot a training metric against its validation counterpart, per epoch
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable2])), history[variable2])
    plt.legend([variable, variable2])
    plt.title(variable)
In [17]:
plot(history.history, "accuracy", 'val_accuracy')
In [18]:
plot(history.history, "loss", 'val_loss')

Prediction

In [19]:
x = np.random.randint(0, len(df))    # pick a random question from the full dataset (randint's upper bound is exclusive)

question = df['Questions'].values[x]

print("Question: ", question)

cleaned_text = []

sentence = removeURL(question) 
sentence = removeHTML(sentence)
sentence = onlyAlphabets(sentence)
sentence = sentence.lower()   

for word in sentence.split():
    # stop-word removal skipped, as in preprocessing
    stemmed = sno.stem(word)
    cleaned_text.append(stemmed)

cleaned_text = [' '.join(cleaned_text)]

print("Cleaned text: ", cleaned_text[0])

cleaned_text = tokenizer.texts_to_sequences(cleaned_text)
cleaned_text = pad_sequences(cleaned_text, maxlen=mlen)

category = df['Category0'].values[x]  

print("\nTrue category: ", category)

output = model.predict(cleaned_text)[0]

pred = np.argmax(output)

print("\nPredicted category: ", class_names[pred])
print("Probability: ", output[pred])
Question:  What does the Statue of Liberty wear on her feet ?
Cleaned text:  what doe the statu of liberti wear on her feet

True category:  ENTITY

Predicted category:  DESCRIPTION
Probability:  0.4965247

deepC

In [20]:
model.save('question.h5')

!deepCC question.h5
[INFO]
Reading [keras model] 'question.h5'
[SUCCESS]
Saved 'question_deepC/question.onnx'
[INFO]
Reading [onnx model] 'question_deepC/question.onnx'
[INFO]
Model info:
  ir_vesion : 4
  doc       : 
[WARNING]
[ONNX]: lstm (LSTM) has 4 inputs, that aren't connected.
[WARNING]
[ONNX]: terminal (input/output) embedding_input's shape is less than 1. Changing it to 1.
[WARNING]
[ONNX]: terminal (input/output) dense_2's shape is less than 1. Changing it to 1.
WARN (GRAPH): found operator node with the same name (dense_2) as io node.
[INFO]
Running DNNC graph sanity check ...
ERROR (GRAPH): some of graph sequential's node lstm's
               outputs are not connected to other nodes in the graph.
[ERROR]
Failed. Please check your model. graph sequential
operator Cast {
	input embedding_input
	output casted
}
operator embedding {
	input embedding_embeddings_0
	input casted
	output embedding_embedding_lookup_Identity_1_0
}
operator Transpose {
	input embedding_embedding_lookup_Identity_1_0
	output lstm_X
}
operator lstm {
	input lstm_X
	input lstm_W
	input lstm_R
	input lstm_B
	output lstm_Y
	output lstm_Y_h
	output lstm_Y_c
}
operator Squeeze {
	input lstm_Y_h
	output lstm_PartitionedCall_0
}
operator dense {
	input lstm_PartitionedCall_0
	input dense_kernel_0
	output dense0
}
operator Add2 {
	input dense0
	input dense_bias_0
	output biased_tensor_name2
}
operator Relu1 {
	input biased_tensor_name2
	output dense_Relu_0
}
operator dense_1 {
	input dense_Relu_0
	input dense_1_kernel_0
	output dense_10
}
operator Add1 {
	input dense_10
	input dense_1_bias_0
	output biased_tensor_name1
}
operator Relu {
	input biased_tensor_name1
	output dense_1_Relu_0
}
operator dense_2 {
	input dense_1_Relu_0
	input dense_2_kernel_0
	output dense_20
}
operator Add {
	input dense_20
	input dense_2_bias_0
	output biased_tensor_name
}
operator Softmax {
	input biased_tensor_name
	output dense_2
}
weight { float dense_2_kernel_0 [32,6] }
weight { float dense_2_bias_0 [6] }
weight { float dense_1_kernel_0 [64,32] }
weight { float dense_1_bias_0 [32] }
weight { float dense_kernel_0 [128,64] }
weight { float dense_bias_0 [64] }
weight { float lstm_W [1,512,128] }
weight { float lstm_R [1,512,128] }
weight { float lstm_B [1,1024] }
weight { float embedding_embeddings_0 [1500,128] }
input {float embedding_input[1,200]}
output {float dense_2[1,6]}


[INFO]
Writing C++ file 'question_deepC/question.cpp'
ERROR (TYPE INFER): cound not find all nodes for lstm,
WARN (CODEGEN): cound not find all nodes for lstm,
                an instance of LSTM.
                Please check model's sanity and try again.
[INFO]
deepSea model files are ready in 'question_deepC/' 
[RUNNING COMMAND]
g++ -std=c++11 -O3 -fno-rtti -fno-exceptions -I. -I/opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/include -isystem /opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/packages/eigen-eigen-323c052e1731 "question_deepC/question.cpp" -D_AITS_MAIN -o "question_deepC/question.exe"
[ERROR]
question_deepC/question.cpp: In function ‘std::vector<deepSea::ndarray<float> > deepSea_model(deepSea::ndarray<float>)’:
question_deepC/question.cpp:71:26: error: wrong number of template arguments (2, should be 1)
   71 |   dnnc::Cast<float, float> Cast("Cast");
      |                          ^
In file included from question_deepC/question.cpp:21:
/opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/include/operators/Cast.h:31:29: note: provided for ‘template<class T> class dnnc::Cast’
   31 | template <typename T> class Cast : public baseOperator<T, T, T> {
      |                             ^~~~
question_deepC/question.cpp:71:33: error: invalid conversion from ‘const char*’ to ‘int’ [-fpermissive]
   71 |   dnnc::Cast<float, float> Cast("Cast");
      |                                 ^~~~~~
      |                                 |
      |                                 const char*
question_deepC/question.cpp:73:8: error: request for member ‘setAttribute’ in ‘Cast’, which is of non-class type ‘int’
   73 |   Cast.setAttribute ( attr_to, Cast_to );
      |        ^~~~~~~~~~~~
question_deepC/question.cpp:74:41: error: request for member ‘compute’ in ‘Cast’, which is of non-class type ‘int’
   74 |   tensor<float> dnnc_Cast_casted = Cast.compute ( dnnc_embedding_input);
      |                                         ^~~~~~~
question_deepC/question.cpp:77:35: error: wrong number of template arguments (3, should be 2)
   77 |   dnnc::Gather<float, float, float> embedding("embedding");
      |                                   ^
In file included from question_deepC/question.cpp:22:
/opt/tljh/user/lib/python3.7/site-packages/deepC-0.13-py3.7-linux-x86_64.egg/deepC/include/operators/Gather.h:32:7: note: provided for ‘template<class To, class Ti> class dnnc::Gather’
   32 | class Gather : public baseOperator<To, To, Ti> {
      |       ^~~~~~
question_deepC/question.cpp:77:47: error: invalid conversion from ‘const char*’ to ‘int’ [-fpermissive]
   77 |   dnnc::Gather<float, float, float> embedding("embedding");
      |                                               ^~~~~~~~~~~
      |                                               |
      |                                               const char*
question_deepC/question.cpp:78:84: error: request for member ‘compute’ in ‘embedding’, which is of non-class type ‘int’
   78 |   tensor<float> dnnc_embedding_embedding_embedding_lookup_Identity_1_0 = embedding.compute ( dnnc_embedding_embeddings_0, dnnc_Cast_casted);
      |                                                                                    ^~~~~~~

[ERROR]
Couldn't create executable.

usage: deepCC [-h] [--output] [--format] [--verbose] [--profile ]
              [--app_tensors FILE] [--archive] [--bundle] [--debug]
              [--mem_override] [--optimize_peak_mem] [--init_net_model]
              [--input_data_type] [--input_shape] [--cc] [--cc_flags  [...]]
              [--board]
              input
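The compilation fails: the exported ONNX graph's LSTM node has outputs (lstm_Y, lstm_Y_c) that are not connected to any other node, and the generated C++ then hits template-argument mismatches in deepC's Cast and Gather operators, so no executable is produced.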