This notebook uses convolutional neural networks to determine the characters in the given captcha.
A base case implementation of recognizing text in images.
Dataset¶
The link below contains the zipped dataset folder.
The dataset has 1070 captcha images, each with 5 characters. Each character is either a lowercase letter or a digit.
The filename of each image is its corresponding text.
!wget -N "https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/captcha.zip"
!unzip -qo captcha.zip
!rm captcha.zip
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from PIL import Image, ImageOps
from tensorflow.keras import layers
import random
The captcha in the images contains only lowercase letters and the digits 0-9.
# Every character a captcha may contain: the lowercase letters a-z and digits 0-9.
characters = 'abcdefghijklmnopqrstuvwxyz0123456789'
print("Number of characters: ", len(characters))

# Count the captcha images available in the extracted dataset folder.
total_samples = len(os.listdir('samples'))
print("Total number of images: ", total_samples)
Viewing few images in the dataset.
num = 4  # Number of examples to show
fig = plt.figure()
for idx, fname in enumerate(os.listdir('samples')[:num]):
    image = Image.open('samples/' + fname)
    axis = fig.add_subplot(num, 1, idx + 1)  # One stacked row per example
    # Hide both axes so only the captcha image itself is visible.
    axis.axes.get_xaxis().set_visible(False)
    axis.axes.get_yaxis().set_visible(False)
    axis.imshow(image)
Preprocessing¶
Functions to one hot encode and decode all the characters in the captcha
def encode_label(label):
    """One-hot encode a captcha string.

    Parameters
    ----------
    label : str
        The captcha text; each character must appear in `characters`.
        Expected to be 5 characters long for this dataset.

    Returns
    -------
    numpy.ndarray of shape (5, len(characters))
        Row k is the one-hot encoding of the k-th character of `label`.

    Raises
    ------
    ValueError
        If `label` contains a character not present in `characters`.
    """
    y_temp = np.zeros((5, len(characters)))  # Zero array of the required shape
    for k, ch in enumerate(label):
        # str.index raises ValueError on an unknown character. The original
        # used str.find, which returns -1 for misses and therefore silently
        # one-hot encoded the LAST charset entry ('9') for any invalid input.
        y_temp[k, characters.index(ch)] = 1
    return y_temp
def decode_label(output):
    """Decode network output back into the captcha text.

    Parameters
    ----------
    output : array-like of shape (n, len(characters))
        One score/probability row per character position (n is 5 for this
        dataset, but any number of rows is accepted).

    Returns
    -------
    str
        For each row, the character whose score is highest.
    """
    # Generalized from a hard-coded range(5): iterate whatever rows are given.
    return ''.join(characters[np.argmax(row)] for row in output)
# Round-trip sanity check: encoding then decoding should reproduce the label.
sample_label = '22efd'
encoded = encode_label(sample_label)
decoded = decode_label(encoded)
print("Encoded: ", encoded)
print("Decoded: ", decoded)
Input¶
The input X is the numpy array representation of each image. The images have 4 channels which are reduced to 1 by converting the image to greyscale. They are then normalized so that every pixel has a value between 0 and 1.
Output¶
The output y has 5 arrays representing the five characters required as output. Each of these arrays has a one hot encoded array of length 36 representing the ith (1 to 5) character for each input image.
# Build the input and output arrays.
#   X: one greyscale, normalized image per sample, shape (H, W, 1).
#   y: for each of the 5 character positions, a (total_samples, 36) one-hot array.
X = []
y = np.zeros((5, total_samples, len(characters)))
images_list = os.listdir('samples')
for i, fname in enumerate(images_list):
    image = Image.open('samples/' + fname)
    image = ImageOps.grayscale(image)     # Collapse to a single channel
    image = np.asarray(image) / 255       # Normalize pixels into [0, 1]
    image = image[:, :, np.newaxis]       # Add a trailing channel dimension
    # The filename minus its 4-character extension is the captcha text.
    y[:, i] = encode_label(fname[:-4])
    X.append(image)
X = np.asarray(X)
y = np.asarray(y)
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)
Using a train-test split of 90-10
# 90/10 train-test split, taken in file order.
train_count = int(total_samples * 0.9)
test_count = total_samples - train_count
print("Images in train set: ", train_count)
print("Images in test set: ", test_count)

# Samples are indexed on axis 0 of X but on axis 1 of y.
xtrain = X[:train_count]
xtest = X[train_count:]
ytrain = y[:, :train_count]
ytest = y[:, train_count:]
print("Train shapes: ", xtrain.shape, ytrain.shape)
print("Test shapes: ", xtest.shape, ytest.shape)
Model¶
The model has convolution and max pooling layers that branch into 5 dense networks, one for each character.
def create_model():
    """Build and compile the captcha recognition network.

    A shared convolutional trunk (three Conv2D + MaxPooling2D stages,
    BatchNormalization, Flatten) branches into 5 independent dense heads,
    one per character position; each head scores the 36 possible characters.

    Returns
    -------
    A compiled tf.keras Model mapping one image to a list of 5 outputs.
    """
    input_layer = layers.Input(shape=xtrain[0].shape)

    # Convolutional trunk.
    conv1 = layers.Conv2D(8, (3, 3), padding='same', activation='relu')(input_layer)
    max1 = layers.MaxPooling2D(padding='same')(conv1)
    conv2 = layers.Conv2D(16, (3, 3), padding='same', activation='relu')(max1)
    max2 = layers.MaxPooling2D(padding='same')(conv2)
    conv3 = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(max2)
    max3 = layers.MaxPooling2D(padding='same')(conv3)
    batch3 = layers.BatchNormalization()(max3)
    flat = layers.Flatten()(batch3)

    # One dense head per output character position.
    output_layers = []
    for _ in range(5):
        dense1 = layers.Dense(32, activation='relu')(flat)
        drop = layers.Dropout(0.6)(dense1)
        # softmax, not sigmoid: categorical_crossentropy with one-hot targets
        # expects each head to emit a probability distribution over the classes.
        dense2 = layers.Dense(len(characters), activation='softmax')(drop)
        output_layers.append(dense2)

    model = tf.keras.models.Model(input_layer, output_layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Build the model and display its architecture.
model = create_model()
model.summary()

# Evaluate the untrained model as a baseline.
# Renamed from `eval`, which shadowed the Python builtin.
baseline_metrics = model.evaluate(xtest, [ytest[0], ytest[1], ytest[2], ytest[3], ytest[4]], verbose=1)
Training the model¶
Since the model has 5 different dense networks, one for each character, the y parameter of the model.fit() method is given a list of 5 arrays.
Training the model for 128 epochs.
# Train for 128 epochs; y is supplied as a list of 5 arrays, one per head.
history = model.fit(xtrain, [ytrain[0], ytrain[1], ytrain[2], ytrain[3], ytrain[4]], batch_size=32, epochs=128, verbose=1)

# Evaluate the trained model (renamed from `eval`, which shadowed the builtin).
metrics = model.evaluate(xtest, [ytest[0], ytest[1], ytest[2], ytest[3], ytest[4]], verbose=1)
print("\nAccuracies of the 5 dense networks: ")
# The last 5 entries of the metrics list are the per-head accuracies.
for i in range(-5, 0):
    print(metrics[i])
Plot¶
history_keys = list(history.history.keys())
history_keys  # Notebook cell echo: inspect the recorded metric names

# Plot the overall loss together with the losses of the 5 heads (first 6 keys).
for key in history_keys[:6]:
    plt.plot(history.epoch, history.history[key])
plt.title("Losses")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(history_keys[:6])
plt.show()
# Plot the accuracy curves of the 5 heads (history keys 6 through 10).
for key in history_keys[6:11]:
    plt.plot(history.epoch, history.history[key])
plt.title("Accuracies")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(history_keys[6:11])
plt.show()
def predict(file):
    """Load an image file and return the model's predicted captcha text."""
    image = Image.open(file)
    image = ImageOps.grayscale(image)          # Single channel
    image = np.asarray(image) / 255            # Normalize into [0, 1]
    # Shape (1, H, W, 1): a batch of one single-channel image.
    image = image.reshape((1,) + image.shape + (1,))
    scores = np.asarray(model.predict(image))
    scores = scores.squeeze()                  # Drop singleton dimensions
    return decode_label(scores)
# To sample from the whole dataset instead: random.randrange(total_samples)
# randrange excludes its upper bound; the original randint(train_count,
# total_samples) could return total_samples itself — an IndexError on images_list.
i = random.randrange(train_count, total_samples)  # Pick one from only the test set
filename = 'samples/' + images_list[i]
plt.imshow(Image.open(filename))
plt.axis('off')  # Removing the axes
print("Predicted: ", predict(filename))
deepC¶
model.save('captcha_model.h5')
!deepCC captcha_model.h5
def input_to_deepC(file):
    """Preprocess an image file into the (1, H, W, 1) array the model expects."""
    image = ImageOps.grayscale(Image.open(file))  # Single channel
    arr = np.asarray(image) / 255                 # Normalize into [0, 1]
    # Batch of one, one channel: (1, H, W, 1).
    return arr.reshape(1, arr.shape[0], arr.shape[1], 1)
# Save one preprocessed test sample for deepC.
# The original called input_to_deepC('') — an empty path that crashes — and
# left `sample` unused; also np.savetxt only accepts 1-D/2-D arrays, so the
# (H, W, 1) sample must be flattened before writing.
sample = xtest[1]
np.savetxt('sample.data', sample.flatten())