NOTE: This use case is not intended for resource-constrained devices.
Resume Screening using Deep Learning
Credit: AITS Cainvas Community
Photo by Joe Le Huquet on Dribbble
In this notebook, we predict the job-domain category of a candidate from the text of their resume. The dataset consists of two columns, Resume and Category, where Resume is the input text and Category is the target label.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
resume = pd.read_csv("https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/UpdatedResumeDataSet.csv")
In [3]:
resume
Out[3]:
In [4]:
#view an example of a resume from our data
resume['Resume'][0]
Out[4]:
In [5]:
resume['Category'].value_counts()
Out[5]:
In [6]:
sns.countplot(y="Category", data=resume)
Out[6]:
In [7]:
#pre-processing of data to remove special characters, hashtags, URLs etc.
import re
def cleanResume(resumeText):
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse extra whitespace
    return resumeText
resume['cleaned_resume'] = resume['Resume'].apply(cleanResume)
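As a quick sanity check (an extra step, not part of the original notebook), applying cleanResume to a short made-up string shows the effect of each rule:

sample = "Check https://example.com #ML @user RT hello world"   # hypothetical example string
print(cleanResume(sample))  # -> "Check hello world" (URL, hashtag, mention and RT removed, whitespace collapsed)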
In [8]:
#dataset after pre-processing
resume
Out[8]:
In [9]:
# Printing an original resume
print('--- Original resume ---')
print(resume['Resume'][0])
In [10]:
# Printing the same resume after text cleaning
print('--- Cleaned resume ---')
print(resume['cleaned_resume'][0])
In [11]:
#Obtaining the most common words
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud
oneSetOfStopWords = set(stopwords.words('english') + ['``', "''"])
totalWords = []
Sentences = resume['cleaned_resume'].values
cleanedSentences = ""
for i in range(len(resume)):
    cleanedText = cleanResume(Sentences[i])
    cleanedSentences += cleanedText
    requiredWords = nltk.word_tokenize(cleanedText)
    for word in requiredWords:
        if word not in oneSetOfStopWords and word not in string.punctuation:
            totalWords.append(word)
wordfreqdist = nltk.FreqDist(totalWords)
mostcommon = wordfreqdist.most_common(50)
print(mostcommon)
In [12]:
#Visualising most common words with Wordcloud
wordcloud = WordCloud(background_color='black',
                      width=1600,
                      height=800).generate(cleanedSentences)
fig = plt.figure(figsize=(30,20))
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
fig.savefig("tag.png")
plt.show()
In [13]:
from sklearn.utils import shuffle
# Get features and labels from the data and shuffle
features = resume['cleaned_resume'].values
original_labels = resume['Category'].values
labels = original_labels.copy()  # copy so the original labels are not modified in place
for i in range(len(resume)):
    labels[i] = str(labels[i]).lower()       # convert to lowercase
    labels[i] = labels[i].replace(" ", "")   # remove spaces so multi-word labels become single tokens
features, labels = shuffle(features, labels)
# Print example feature and label
print(features[0])
print(labels[0])
In [14]:
# Split into train and test
train_split = 0.8
train_size = int(train_split * len(resume))
train_features = features[:train_size]
train_labels = labels[:train_size]
test_features = features[train_size:]
test_labels = labels[train_size:]
# Print size of each split
print(len(train_labels))
print(len(test_labels))
In [15]:
#tokenize features and labels
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Tokenize feature data
vocab_size = 6000
oov_tok = '<>'
feature_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
feature_tokenizer.fit_on_texts(features)
feature_index = feature_tokenizer.word_index
print(dict(list(feature_index.items())[:20]))
# Convert the train and test features into integer sequences
train_feature_sequences = feature_tokenizer.texts_to_sequences(train_features)
test_feature_sequences = feature_tokenizer.texts_to_sequences(test_features)
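To sanity-check the feature tokenizer (an added step, not in the original notebook), a sequence can be decoded back into words; anything outside the 6000-word vocabulary comes back as the '<>' OOV token:

# Decode the first training sequence back into words to verify the encoding
decoded = feature_tokenizer.sequences_to_texts([train_feature_sequences[0]])[0]
print(decoded[:200])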
In [16]:
# Tokenize label data
label_tokenizer = Tokenizer(lower=True)
label_tokenizer.fit_on_texts(labels)
label_index = label_tokenizer.word_index
print(dict(list(label_index.items())))
# Encode the train and test labels as integer sequences
train_label_sequences = label_tokenizer.texts_to_sequences(train_labels)
test_label_sequences = label_tokenizer.texts_to_sequences(test_labels)
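For later inference it is convenient to invert this mapping; a small helper (an addition to the notebook) that turns a predicted class index back into its category name:

# Map an integer class index back to its category name (index 0 is reserved by the tokenizer and never assigned)
index_to_label = {index: label for label, index in label_index.items()}
print(index_to_label[1])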
In [17]:
# Pad sequences for feature data
max_length = 300
trunc_type = 'post'
pad_type = 'post'
train_feature_padded = pad_sequences(train_feature_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)
test_feature_padded = pad_sequences(test_feature_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)
# Print example padded sequences from train and test datasets
print(train_feature_padded[0])
print(test_feature_padded[0])
In [18]:
#Train a sequential model
# Define the neural network
embedding_dim = 64
model = tf.keras.Sequential([
    # Embedding layer: input vocabulary of size 6000, output embedding dimension of 64 (set above)
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # Bidirectional LSTM reads each sequence in both directions
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    # Dense hidden layer with ReLU activation
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Output layer: 26 units (25 categories plus the tokenizer's reserved index 0) with softmax for a probability distribution
    tf.keras.layers.Dense(26, activation='softmax')
])
model.summary()
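The output layer has 26 units because the label tokenizer assigns indices 1-25 to the 25 categories and reserves index 0, and sparse_categorical_crossentropy requires the output dimension to exceed the largest label index. A slightly more robust variant (a suggestion, not in the original notebook) derives this from label_index:

num_classes = len(label_index) + 1  # 25 categories + reserved index 0 = 26
print(num_classes)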
In [19]:
# Compile the model and convert train/test data into NumPy arrays
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Features
train_feature_padded = np.array(train_feature_padded)
test_feature_padded = np.array(test_feature_padded)
# Labels
train_label_sequences = np.array(train_label_sequences)
test_label_sequences = np.array(test_label_sequences)
In [20]:
# Train the neural network
num_epochs = 12
history = model.fit(train_feature_padded, train_label_sequences, epochs=num_epochs, validation_data=(test_feature_padded, test_label_sequences), verbose=2)
In [21]:
# Print an example test resume and its ground-truth label
print(test_features[5])
print(test_labels[5])
In [22]:
# One more test resume and its ground-truth label
print(test_features[8])
print(test_labels[8])
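The two cells above only print the ground-truth labels. A minimal sketch (assuming the padded test data and label_tokenizer from the earlier cells) of actually running the model on one of these resumes and mapping the predicted class index back to its name:

# Predict the category of test example 8 and decode the class index with the label tokenizer
predicted_index = np.argmax(model.predict(test_feature_padded[8:9]), axis=-1)[0]
print(label_tokenizer.index_word.get(predicted_index, 'unknown'))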
In [23]:
# Determine loss and accuracy on the test set
score = model.evaluate(test_feature_padded, test_label_sequences, verbose=1)
print("Test Loss:", score[0])
print("Test Accuracy:", score[1])
In [24]:
#Visualising the model accuracy and loss
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
In [25]:
model.save("resume_screening.h5")
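To reuse the saved model, here is a sketch of loading it back and classifying a new resume string (assuming cleanResume, feature_tokenizer, the padding settings and label_tokenizer from this session are still available; in a fresh session the tokenizers would also need to be persisted, e.g. with pickle):

loaded_model = tf.keras.models.load_model("resume_screening.h5")
# Hypothetical new resume text
new_resume = "Experienced data analyst skilled in Python, SQL, machine learning and Tableau dashboards."
padded = pad_sequences(feature_tokenizer.texts_to_sequences([cleanResume(new_resume)]),
                       maxlen=max_length, padding=pad_type, truncating=trunc_type)
predicted_index = np.argmax(loaded_model.predict(padded), axis=-1)[0]
print(label_tokenizer.index_word.get(predicted_index, 'unknown'))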