NOTE: This use case is not intended for resource-constrained devices.
Resume Screening using Deep Learning
Credit: AITS Cainvas Community
Photo by Joe Le Huquet on Dribbble
In this notebook, we predict the domain category of a given resume. The dataset consists of two columns, Resume and Category: Resume is the input text and Category is the target label.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
resume = pd.read_csv("https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/UpdatedResumeDataSet.csv")
In [3]:
resume
Out[3]:
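Before moving on, it can help to confirm the size of the data and that neither column has missing entries. This is a small sketch; it only assumes the resume DataFrame loaded above and the two column names described earlier.

# Quick sanity check on the loaded data (sketch)
print(resume.shape)             # number of rows and columns
print(resume.columns.tolist())  # expect ['Category', 'Resume']
print(resume.isnull().sum())    # confirm there are no missing values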
In [4]:
#view an example of a resume from our data
resume['Resume'][0]
Out[4]:
In [5]:
resume['Category'].value_counts()
Out[5]:
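Since class imbalance affects how a classifier learns, it can also be worth looking at each category's share of the data. A minimal sketch using the same Category column:

# Share of each category as a percentage of all resumes (sketch)
print((resume['Category'].value_counts(normalize=True) * 100).round(2))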
In [6]:
sns.countplot(y="Category", data=resume)
Out[6]:
In [7]:
# Pre-processing of the text to remove URLs, hashtags, mentions, punctuation, etc.
import re

def cleanResume(resumeText):
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse extra whitespace
    return resumeText

resume['cleaned_resume'] = resume['Resume'].apply(cleanResume)
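To see what the function does in isolation, it can be applied to a short made-up snippet. The sample string below is purely illustrative and is not taken from the dataset.

# Illustrative call on a made-up string (not from the dataset)
sample = "RT @user Check https://example.com #hiring!! C++, résumé skills"
print(cleanResume(sample))
# URLs, mentions, hashtags, punctuation and non-ASCII characters are replaced
# by spaces, and repeated whitespace is collapsed into a single space.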
In [8]:
# Dataset after pre-processing
resume
Out[8]:
In [9]:
# Printing an original resume
print('--- Original resume ---')
print(resume['Resume'][0])
In [10]:
# Printing the same resume after text cleaning
print('--- Cleaned resume ---')
print(resume['cleaned_resume'][0])
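A rough way to quantify how much the cleaning removed is to compare character counts before and after. A small sketch on the same first row:

# Compare text length before and after cleaning (sketch)
print('Original length:', len(resume['Resume'][0]))
print('Cleaned length: ', len(resume['cleaned_resume'][0]))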
In [11]:
# Obtaining the most common words
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

oneSetOfStopWords = set(stopwords.words('english') + ['``', "''"])
totalWords = []
Sentences = resume['cleaned_resume'].values
cleanedSentences = ""
for i in range(len(resume)):
    cleanedText = cleanResume(Sentences[i])
    cleanedSentences += cleanedText
    requiredWords = nltk.word_tokenize(cleanedText)
    for word in requiredWords:
        if word not in oneSetOfStopWords and word not in string.punctuation:
            totalWords.append(word)

wordfreqdist = nltk.FreqDist(totalWords)
mostcommon = wordfreqdist.most_common(50)
print(mostcommon)
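The same frequency distribution can also be plotted directly with NLTK, which may be easier to read than the raw list. A sketch using FreqDist.plot:

# Plot the 30 most frequent words (sketch)
plt.figure(figsize=(12, 6))
wordfreqdist.plot(30)
plt.show()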
In [12]:
# Visualising the most common words with a word cloud
wordcloud = WordCloud(background_color='black',
                      width=1600,
                      height=800).generate(cleanedSentences)
fig = plt.figure(figsize=(30,20))
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
fig.savefig("tag.png")
plt.show()
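As a possible next step toward the classification task described at the top of the notebook (a sketch, not the notebook's exact pipeline), the Category labels could be converted to integers with scikit-learn before training a model.

# Encode the target labels numerically (sketch; assumes scikit-learn is available)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
resume['Category_encoded'] = le.fit_transform(resume['Category'])
print(dict(zip(le.classes_[:5], range(5))))  # preview a few label-to-integer mappings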