Language identification in text
Credit: AITS Cainvas Community
Photo by Pendar Yousefi on Dribbble
Language detection refers to determining the language that the given text is written in.
This categorization becomes important when the language of the input data is not known in advance. For example, the "Detect language" feature of Google Translate identifies the language of the input text before translating it.
import numpy as np
import pandas as pd
from keras import layers, optimizers, losses, callbacks, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
import random
The dataset
WiLI-2018, the Wikipedia language identification benchmark dataset, contains 235,000 paragraphs in 235 languages. After data selection and preprocessing, 22 languages from the original dataset were selected to create the current dataset.
The dataset is a CSV file consisting of 1000 text samples for each of 22 languages, making a total of 22k samples in the dataset.
# Download the language-identification CSV and load it into a DataFrame.
DATASET_URL = 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/language_dataset.csv'
df = pd.read_csv(DATASET_URL)
df  # notebook display of the loaded frame

# Number of samples available for each language label.
df['language'].value_counts()
This is a balanced dataset!
# One-hot encode the language label: one indicator column per language.
languages_onehot = pd.get_dummies(df['language'])
# The indicator column names double as the ordered list of class labels.
languages = languages_onehot.columns

# Copy every indicator column onto df so each row carries its own target vector.
for column, indicator in languages_onehot.items():
    df[column] = indicator

df  # notebook display of the augmented frame
Visualization
A peek into the samples from different languages!
# Print the first text sample of every language, preceded by the language name.
for language in languages:
    df_temp = df.loc[df['language'] == language]
    first_sample = df_temp['Text'].iloc[0]
    print("\n", language, "\n", first_sample, sep = "")
If we look closer at the samples, we can see that there are a few English words in the Urdu sample and transliterated words in the Thai sample too! This may lead to ambiguities during prediction.
# 80/10/10 train/validation/test split, done in two stages with a fixed seed.
# Stage 1: hold out 20% of the rows.
train_df, val_test_df = train_test_split(df, random_state = 3, test_size = 0.2)
# Stage 2: divide the held-out rows equally into validation and test sets.
val_df, test_df = train_test_split(val_test_df, random_state = 3, test_size = 0.5)

# Report the size of each split.
print("Number of samples in...")
for label, subset in (
    ("Training set: ", train_df),
    ("Validation set: ", val_df),
    ("Testing set: ", test_df),
):
    print(label, len(subset))
Bag of Words
# Unigram bag-of-words features, capped at 20000 vocabulary entries.
cv = CountVectorizer(max_features=20000, ngram_range = (1,1))

# The vocabulary is learned from the training texts only; the validation and
# test texts are encoded with that same vocabulary.
train_bow = cv.fit_transform(train_df['Text'])
val_bow, test_bow = (cv.transform(part['Text']) for part in (val_df, test_df))

# Dense feature matrices for the network.
Xtrain, Xval, Xtest = (bow.toarray() for bow in (train_bow, val_bow, test_bow))

# One-hot targets: the per-language indicator columns of each split.
ytrain, yval, ytest = (part[languages] for part in (train_df, val_df, test_df))
Standardization
# Fit the scaler on the training features only, then apply that same scaling
# to every split so validation/test statistics never leak into training.
std_scaler = StandardScaler().fit(Xtrain)
Xtrain = std_scaler.transform(Xtrain)
Xval = std_scaler.transform(Xval)
Xtest = std_scaler.transform(Xtest)
The model
# Fully connected classifier: three ReLU hidden layers (128, 64, 32 units)
# followed by a softmax layer with one output per language.
model = models.Sequential()
model.add(layers.Dense(128, activation = 'relu', input_shape = Xtrain[0].shape))
for units in (64, 32):
    model.add(layers.Dense(units, activation = 'relu'))
model.add(layers.Dense(len(languages), activation = 'softmax'))

# Early stopping with a patience of 5 epochs; the best weights are restored.
cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]


def _train_stage(learning_rate):
    """(Re)compile ``model`` with Adam at ``learning_rate``, fit it, and return the History."""
    model.compile(
        optimizer = optimizers.Adam(learning_rate),
        loss = losses.CategoricalCrossentropy(),
        metrics = ['accuracy'],
    )
    return model.fit(
        Xtrain,
        ytrain,
        validation_data = (Xval, yval),
        epochs = 64,
        callbacks = cb,
    )


# Two training stages on the same model: first at 1e-3, then at 1e-4.
history = _train_stage(0.001)
history1 = _train_stage(0.0001)

# Loss and accuracy on the held-out test split.
model.evaluate(Xtest, ytest)
# Confusion matrix on the test set, with each row (true language) normalised
# so that its entries sum to 1.
class_ids = np.arange(len(languages))
cm = confusion_matrix(
    np.argmax(ytest.to_numpy(), axis = 1),
    np.argmax(model.predict(Xtest), axis = 1),
    labels = class_ids,    # one row/column per language even if a class is absent from the split
    normalize = 'true',    # row-normalise; a row with no samples becomes 0 rather than NaN
)

fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot(111)

# Annotate every cell with its value. i indexes rows (true class) and
# j indexes columns (predicted class); text is placed at (x=j, y=i).
n_rows, n_cols = cm.shape
for i in range(n_rows):
    for j in range(n_cols):
        # White text on the darkest cells keeps the numbers readable.
        clr = "white" if cm[i, j] > 0.8 else "black"
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color=clr)

_ = ax.imshow(cm, cmap=plt.cm.Blues)

# Label both axes with the language names, in class-index order.
ax.set_xticks(range(len(languages)))
ax.set_yticks(range(len(languages)))
ax.set_xticklabels(languages, rotation = 90)
ax.set_yticklabels(languages)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Plotting the metrics
def plot(history1, history2, variable1, variable2):
    """Plot two metrics across both training runs on a fresh figure.

    Args:
        history1: Mapping from metric name to a list of per-epoch values for
            the first training run (e.g. ``History.history``).
        history2: Same mapping for the second training run.
        variable1: Key of the first metric to plot; also used as the title.
        variable2: Key of the second metric to plot.

    Neither ``history1`` nor ``history2`` is modified, so the function can be
    called repeatedly with the same arguments and always draws the same curves.
    """
    # Build new lists rather than extending the callers' lists in place;
    # otherwise every call would append the second run's values again.
    var1_history = list(history1[variable1]) + list(history2[variable1])
    var2_history = list(history1[variable2]) + list(history2[variable2])

    # Start a new figure so consecutive calls do not draw onto the same axes.
    plt.figure()
    plt.plot(range(len(var1_history)), var1_history)
    plt.plot(range(len(var2_history)), var2_history)
    plt.legend([variable1, variable2])
    plt.title(variable1)
    plt.show()
# Draw training vs. validation curves for accuracy and then for loss,
# each spanning both training runs.
for train_key, val_key in (("accuracy", 'val_accuracy'), ("loss", 'val_loss')):
    plot(history.history, history1.history, train_key, val_key)
Prediction
# Pick one random sample from the test set.
x = random.randint(0, len(Xtest) - 1)
sample_text = test_df['Text'].to_numpy()[x]

print("Sample:\n", sample_text, sep = "")

# Apply the same preprocessing used for training: bag-of-words counts followed
# by standardization. The network was fitted on scaled features, so feeding it
# raw counts would put the input on a different scale than it was trained on.
sample_features = std_scaler.transform(cv.transform([sample_text]).toarray())

output = model.predict(sample_features)
pred = np.argmax(output[0]) # index of the most probable class

print("\nPredicted: ", languages[pred]) # map the class index back to its language name
print("Probability: ", output[0][pred])

output_true = test_df['language'].to_numpy()[x]
print("\nTrue: ", output_true)
deepC
model.save('language.h5')
!deepCC language.h5
# pick random test data sample from one batch
x = random.randint(0, len(Xtest) - 1)
print("Sample:\n", test_df['Text'].to_numpy()[x], "\n", sep = "")
input = cv.transform([test_df['Text'].to_numpy()[x]]).toarray()
np.savetxt('sample.data', input) # xth sample into text file
# run exe with input
!language_deepC/language.exe sample.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
output = model.predict(input)
pred = np.argmax(output[0]) # finding max
print("\nPredicted: ", languages[pred]) # Picking the label from class_names based on the model output
print("Probability: ", output[0][pred])
output_true = test_df['language'].to_numpy()[x]
print("\nTrue: ", output_true)