NOTE: This Use Case is not purposed for resource constrained devices.
Neural Machine Translation (English To French)¶
Credit: AITS Cainvas Community
Photo by Olivia G. Sutanto for Google on Dribbble
Language translation is a key service needed by people across the globe. This notebook builds a Neural Machine Translator that translates English to French using a seq2seq NLP model based on an LSTM encoder–decoder neural network.
Dataset for training the model was taken from Kaggle. Here is the link
Import Libraries¶
In [1]:
# NLTK setup: fetch the resources required by nltk.word_tokenize used later
import nltk
# download nltk packages
# 'punkt' supplies the pretrained sentence/word tokenizer models
nltk.download('punkt')
# download stopwords
# stopword lists are downloaded here but not actually used later in this notebook
nltk.download("stopwords")
Out[1]:
In [2]:
from collections import Counter
import operator
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, TimeDistributed, RepeatVector, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
Import Datasets¶
In [3]:
# load the data
# FIX: the original used sep='/t' (a typo for the tab character). With
# engine='python' that literal '/t' never matched, so each line happened to be
# read as a single column anyway; sep='\t' states the intent correctly and
# yields the same one-sentence-per-row frame.
df_english = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/small_vocab_en.csv',
                         sep='\t', names=['english'], engine='python')
df_french = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/small_vocab_fr.csv',
                        sep='\t', names=['french'], engine='python')
Visualizing both the data frames¶
In [4]:
# notebook echo: inspect the raw English dataframe
df_english
Out[4]:
In [5]:
# notebook echo: inspect the raw French dataframe
df_french
Out[5]:
In [6]:
# summary of the English frame: row count, dtypes, memory usage
df_english.info()
In [7]:
# summary of the French frame: row count, dtypes, memory usage
df_french.info()
Concatenating both the English and French dataframes into a single DataFrame¶
In [8]:
# pair the English and French sentences column-wise into one dataframe
# (axis=1 relies on both frames sharing the same row index / length)
df = pd.concat([df_english, df_french], axis = 1)
In [9]:
# notebook echo of the combined parallel corpus
df
Out[9]:
In [10]:
# print how many parallel records each side of the corpus holds
print(f"Total English records = {len(df['english'])}")
print(f"Total French records = {len(df['french'])}")
Performing Data Cleaning¶
In [12]:
# function to remove punctuations
def remove_punc(x):
    """Return *x* with every occurrence of the characters !#?,.:"; deleted."""
    # str.translate with a deletion table is a one-pass equivalent of the
    # original re.sub('[!#?,.:";]', '', x)
    return x.translate(str.maketrans('', '', '!#?,.:";'))
In [13]:
# strip punctuation from every sentence on both sides of the corpus
df['french'] = df['french'].apply(remove_punc)
df['english'] = df['english'].apply(remove_punc)
In [14]:
# accumulators for the unique vocabulary of each language (filled below)
english_words = []
french_words = []
Finding out total unique words in our English and French Vocabulary¶
In [15]:
def get_unique_words(x, word_list):
    """Append each previously unseen word of sentence *x* to *word_list*.

    Mutates *word_list* in place, preserving first-seen order; returns None.
    """
    for token in x.split():
        if token in word_list:
            continue
        word_list.append(token)
# collect the unique English words (apply is used purely for its side effect)
df['english'].apply(lambda x: get_unique_words(x, english_words))
total_english_words = len(english_words)
# notebook echo of the English vocabulary size
total_english_words
Out[15]:
In [16]:
# number of unique words in french
df['french'].apply(lambda x: get_unique_words(x, french_words))
total_french_words = len(french_words)
# notebook echo of the French vocabulary size
total_french_words
Out[16]:
VISUALIZE CLEANED UP DATASET¶
In [17]:
# Obtain list of all words in the dataset (English side, duplicates included)
words = [token for sentence in df['english'] for token in sentence.split()]
In [18]:
# Obtain the total count of words
# Counter maps each word to its frequency across the whole English corpus
english_words_counts = Counter(words)
In [19]:
# sort the dictionary by values
# Counter.most_common() is the idiomatic equivalent of
# sorted(counter.items(), key=operator.itemgetter(1), reverse=True)
english_words_counts = english_words_counts.most_common()
In [20]:
# finding out each word count in our data (notebook echo of the sorted pairs)
english_words_counts
Out[20]:
In [21]:
# append the values to a list for visualization purposes
# unzip the sorted (word, count) pairs into parallel lists with comprehensions;
# unlike zip(*pairs), this also handles an empty pair list gracefully
english_words = [word for word, _ in english_words_counts]
english_counts = [count for _, count in english_words_counts]
In [22]:
# Plot barplot using plotly
# word frequencies, most frequent first (order inherited from the sorted pairs)
fig = px.bar(x = english_words, y = english_counts)
fig.show()
In [23]:
# plot the word cloud for text that is Real
# NOTE(review): comment looks copied from another notebook — this cloud is
# built from the English sentences, not "Real" text.
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000, width = 1600, height = 800 ).generate(" ".join(df.english))
plt.imshow(wc, interpolation = 'bilinear')
Out[23]:
In [24]:
# Tokenized form of first record
# NOTE(review): the bare expression on the next line has no effect — only the
# last expression of a notebook cell is echoed.
df.english[0]
nltk.word_tokenize(df.english[0])
Out[24]:
In [25]:
# Maximum length (number of words) per record. We will need it later for embeddings
# idiomatic max() over a generator replaces the manual running-max loop;
# default=-1 preserves the original result should the corpus be empty
maxlen_english = max((len(nltk.word_tokenize(doc)) for doc in df.english), default=-1)
print("The maximum number of words in any record = ", maxlen_english)
Doing similar operations on French data¶
In [26]:
# collect every word of the French side of the corpus (duplicates included)
words = [token for sentence in df['french'] for token in sentence.split()]
In [27]:
# word -> frequency map for the French corpus
french_words_counts = Counter(words)
In [28]:
# sort the dictionary by values and printing
# Counter.most_common() is the idiomatic equivalent of the manual sorted() call
french_words_counts = french_words_counts.most_common()
french_words_counts
Out[28]:
In [29]:
# append the values to a list for visualization purpose
# unzip the sorted (word, count) pairs into parallel lists with comprehensions
french_words = [word for word, _ in french_words_counts]
french_counts = [count for _, count in french_words_counts]
# frequency bar chart, most frequent word first
fig = px.bar(x = french_words, y = french_counts)
fig.show()
In [30]:
# plot the word cloud for French
# visual sanity check of the cleaned-up French corpus
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df.french))
plt.imshow(wc, interpolation = 'bilinear')
Out[30]:
In [31]:
# Maximum length (number of words) per record. We will need it later for embeddings
# same idiom as the English side: max() with default=-1 replaces the manual loop
maxlen_french = max((len(nltk.word_tokenize(doc)) for doc in df.french), default=-1)
print("The maximum number of words in any record = ", maxlen_french)
Preparing the Data by Performing Tokenization and Padding¶
In [32]:
def tokenize_and_pad(x, maxlen):
    """Fit a word-level Keras Tokenizer on texts *x*, convert them to integer
    sequences, and post-pad each sequence to length *maxlen*.

    Returns (tokenizer, sequences, padded).
    """
    word_tokenizer = Tokenizer(char_level=False)
    word_tokenizer.fit_on_texts(x)
    seqs = word_tokenizer.texts_to_sequences(x)
    padded_seqs = pad_sequences(seqs, maxlen=maxlen, padding='post')
    return word_tokenizer, seqs, padded_seqs
In [33]:
# tokenize and padding to the data
# each language gets its own tokenizer; sequences are post-padded to that
# language's maximum sentence length computed above
x_tokenizer, x_sequences, x_padded = tokenize_and_pad(df.english, maxlen_english)
y_tokenizer, y_sequences, y_padded = tokenize_and_pad(df.french, maxlen_french)
In [34]:
# Total vocab size, since we added padding we add 1 to the total word count
# (Keras Tokenizer ids start at 1; index 0 is reserved for padding)
english_vocab_size = total_english_words + 1
print("Complete English Vocab Size:", english_vocab_size)
In [35]:
# Total vocab size, since we added padding we add 1 to the total word count
# (id 0 is the padding token, hence the +1)
french_vocab_size = total_french_words + 1
print("Complete French Vocab Size:", french_vocab_size)
In [36]:
# show the last English sentence alongside its padded id sequence
print("The tokenized version for document\n", df.english[-1:].item(),"\n is : ", x_padded[-1:])
In [37]:
# show the last French sentence alongside its padded id sequence
print("The tokenized version for document\n", df.french[-1:].item(),"\n is : ", y_padded[-1:])
In [38]:
# function to obtain the text from padded variables
def pad_to_text(padded, tokenizer):
    """Map a padded integer id sequence back to a space-joined string.

    Id 0 (the padding id) maps to '', so padded positions show up as empty
    tokens in the joined output — same behavior as the original.
    """
    # invert word -> id; loop variable renamed so it no longer shadows builtin id()
    id_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    id_to_word[0] = ''
    return ' '.join([id_to_word[j] for j in padded])
In [39]:
# Obtaining the actual text back in original form (round-trip sanity check).
pad_to_text(y_padded[0], y_tokenizer)
Out[39]:
Defining the model¶
In [40]:
# Train test split
from sklearn.model_selection import train_test_split
# 90/10 split of the padded sequences
# NOTE(review): no random_state is set, so the split is not reproducible
# across runs — consider pinning one.
x_train, x_test, y_train, y_test = train_test_split(x_padded, y_padded, test_size = 0.1)
In [41]:
# Sequential Model
# encoder-decoder translation network
# NOTE(review): the intro text mentions a bidirectional LSTM, but this
# architecture uses plain (unidirectional) LSTM layers — confirm intent.
model = Sequential()
# embedding layer
# mask_zero=True makes downstream layers ignore the 0 padding id
model.add(Embedding(english_vocab_size, 256, input_length = maxlen_english, mask_zero = True))
# encoder
# the final hidden state (256 units) summarizes the English sentence
model.add(LSTM(256))
# decoder
# repeatvector repeats the input for the desired number of times to change
# 2D-array to 3D array. For example: (1,256) to (1,23,256)
model.add(RepeatVector(maxlen_french))
model.add(LSTM(256, return_sequences= True ))
# per-timestep softmax over the French vocabulary
model.add(TimeDistributed(Dense(french_vocab_size, activation ='softmax')))
# sparse loss: targets are integer word ids rather than one-hot vectors
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
In [42]:
# change the shape of target from 2D to 3D
# adds a trailing length-1 axis: (samples, maxlen_french) -> (samples, maxlen_french, 1),
# the target shape expected by sparse_categorical_crossentropy with TimeDistributed
y_train = np.expand_dims(y_train, axis = 2)
y_train.shape
Out[42]:
In [43]:
# train the model
# batch 1024; a further 10% of the training split is held out for validation; 25 epochs
history = model.fit(x_train, y_train, batch_size=1024, validation_split= 0.1, epochs=25)
In [44]:
# list the metrics recorded per epoch (loss/accuracy and their val_ variants)
print(history.history.keys())
Visualizing the train and test metrics¶
In [45]:
# matplotlib was already imported above; re-imported here so the plotting
# section also runs standalone
import matplotlib.pyplot as plt
# notebook magic: render figures inline (IPython-only syntax)
%matplotlib inline
In [46]:
# summarize history for Accuracy
fig_acc = plt.figure(figsize=(10, 10))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# NOTE(review): the second curve is the validation split; it is labelled
# 'test' here although no test data is involved.
plt.legend(['train', 'test'], loc='upper left')
plt.show()
In [47]:
# summarize history for Loss
# figure handle renamed (was fig_acc — a copy-paste slip from the accuracy
# plot); nothing reads the handle afterwards, so the rename is safe
fig_loss = plt.figure(figsize=(10, 10))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
In [48]:
# save the model
# HDF5 format; reload later with tensorflow.keras.models.load_model
model.save("eng2french.h5")
ASSESS TRAINED MODEL PERFORMANCE¶
In [49]:
# function to make prediction
def prediction(x, x_tokenizer = x_tokenizer, y_tokenizer = y_tokenizer):
    """Translate one padded English sequence with the trained model.

    Runs model.predict on *x*, takes the argmax French word id at each
    timestep of the first result, and maps the ids back to words (id 0,
    the padding id, maps to '').

    NOTE(review): x_tokenizer is accepted but never used; kept in the
    signature for backward compatibility with existing callers.
    """
    probs = model.predict(x)[0]
    # invert word -> id; 'idx' avoids shadowing the builtin id()
    idx_to_word = {idx: word for word, idx in y_tokenizer.word_index.items()}
    idx_to_word[0] = ''
    return ' '.join([idx_to_word[j] for j in np.argmax(probs, 1)])
In [50]:
# Printing the English text with correct French Translation and predicted French Translation
# (indentation restored — the notebook export had flattened the loop body)
for i in range(5):
    print('Original English word - {}\n'.format(pad_to_text(x_test[i], x_tokenizer)))
    print('Original French word - {}\n'.format(pad_to_text(y_test[i], y_tokenizer)))
    print('Predicted French word - {}\n\n\n\n'.format(prediction(x_test[i:i+1])))
Compiling the model with DeepC for production¶
In [ ]:
# notebook shell magic (IPython-only): invoke the deepC compiler on the saved model
!deepCC eng2french.h5