NOTE: This use case is not intended for resource-constrained devices.
Visual wake word detection is the classification of images into 2 classes - with person(s) or without. Just as audio wake word systems respond to a specific phrase, visual wake word systems respond to the presence of humans in the frame.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, callbacks, optimizers
import os
import random
from PIL import Image
In [2]:
!wget -N https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/vww.zip
!unzip -qo vww.zip
!rm vww.zip
The dataset folder has two sub-folders, person and notperson, containing images of the respective types.
In [3]:
data_dir = 'vww/'

print("Number of samples")
for f in os.listdir(data_dir):
    if os.path.isdir(data_dir + f):
        print(f, " : ", len(os.listdir(data_dir + f + '/')))
It is a balanced dataset.
In [4]:
batch_size = 64
print("Training set")
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    image_size=(96, 96),
    validation_split=0.2,
    subset="training",
    seed=113,
    batch_size=batch_size)

print("Validation set")

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    image_size=(96, 96),
    validation_split=0.2,
    subset="validation",
    seed=113,
    batch_size=batch_size)
Define the class_names for use later.
In [5]:
class_names = train_ds.class_names
print(class_names)
Visualization¶
In [6]:
num_samples = 4    # the number of samples to be displayed in each class

for x in class_names:
    plt.figure(figsize=(10, 10))
    filenames = os.listdir(data_dir + x)
    for i in range(num_samples):
        ax = plt.subplot(1, num_samples, i + 1)
        img = Image.open(data_dir + x + '/' + filenames[i])
        plt.imshow(img)
        plt.title(x)
        plt.axis("off")
In [7]:
print("Shape of one training batch")
for image_batch, labels_batch in train_ds:
input_shape = image_batch[0].shape
print("Input: ", image_batch.shape)
print("Labels: ", labels_batch.shape)
break
Normalizing the pixel values¶
Pixel values are currently integers in the range [0, 255]. Rescaling them to the range [0, 1] helps the network converge faster.
In [8]:
# Normalizing the pixel values
normalization_layer = layers.experimental.preprocessing.Rescaling(1./255)
train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))
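Optionally, the tf.data pipeline can be sped up by caching the decoded images and prefetching batches while the model trains. A minimal sketch, assuming TensorFlow 2.4 or later (this step is not part of the original run):
In [ ]:
# Optional: cache decoded images and prefetch batches to overlap input preparation with training
# (tf.data.AUTOTUNE is available in TF >= 2.4; older versions use tf.data.experimental.AUTOTUNE)
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)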
The model¶
Transfer learning: a MobileNetV2 base pretrained on ImageNet is used as a frozen feature extractor, and only a new classification head is trained on top of it.
In [9]:
base_model = tf.keras.applications.MobileNetV2(weights='imagenet', input_shape=input_shape, include_top=False)    # include_top=False: leave out the ImageNet classification head
base_model.trainable = False    # freeze the pretrained weights

inputs = tf.keras.Input(shape=input_shape)
x = base_model(inputs, training=False)    # training=False keeps the BatchNorm layers in inference mode
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(len(class_names), activation='softmax')(x)    # add our own classification layer

model = tf.keras.Model(inputs, outputs)

cb = [callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]

model.summary()
In [10]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizers.Adam(0.01), metrics=['accuracy'])
history = model.fit(train_ds, validation_data = val_ds, epochs=32, callbacks = cb)
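A common follow-up to this feature-extraction stage (not performed in this notebook) is to unfreeze the base model and fine-tune the whole network at a much lower learning rate. A sketch, assuming it runs after the training above:
In [ ]:
# Optional fine-tuning sketch (assumption: run only after the frozen-base training above)
base_model.trainable = True    # unfreeze the MobileNetV2 weights
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizers.Adam(1e-5),    # much lower learning rate so the pretrained features are not destroyed
              metrics=['accuracy'])
history_fine = model.fit(train_ds, validation_data=val_ds, epochs=5, callbacks=cb)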
In [11]:
model.evaluate(val_ds)
Plotting the metrics¶
In [12]:
def plot(history, variable, variable2):
    # plot a training metric and its validation counterpart against epochs
    plt.plot(range(len(history[variable])), history[variable], label=variable)
    plt.plot(range(len(history[variable2])), history[variable2], label=variable2)
    plt.title(variable)
    plt.legend()
In [13]:
plot(history.history, "accuracy", 'val_accuracy')
In [14]:
plot(history.history, "loss", "val_loss")
Prediction¶
In [15]:
# pick a random test sample from one batch
x = random.randint(0, batch_size - 1)

for i in val_ds.as_numpy_iterator():
    img, label = i
    plt.axis('off')    # remove axes
    plt.imshow(img[x])    # shape (batch_size, 96, 96, 3) --> (96, 96, 3)

    output = model.predict(np.expand_dims(img[x], 0))    # getting output; input shape (96, 96, 3) --> (1, 96, 96, 3)
    pred = np.argmax(output[0])    # index of the highest probability

    print("Predicted: ", class_names[pred])    # picking the label from class_names based on the model output
    print("True: ", class_names[label[x]])
    print("Probability: ", output[0][pred])
    break
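To classify an arbitrary image file from disk rather than a sample from a validation batch, a helper along the following lines can be used. predict_file is a hypothetical name, not part of the original notebook; it simply repeats the resize and [0, 1] rescaling applied during training:
In [ ]:
# Hypothetical helper: classify a single image file from disk
def predict_file(path):
    img = Image.open(path).convert('RGB').resize((96, 96))    # match the 96x96 training resolution
    arr = np.asarray(img, dtype=np.float32) / 255.0           # same [0, 1] rescaling as the training pipeline
    output = model.predict(np.expand_dims(arr, 0))            # add the batch dimension: (96, 96, 3) --> (1, 96, 96, 3)
    pred = np.argmax(output[0])
    return class_names[pred], output[0][pred]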
deepC¶
In [16]:
model.save('visual_wake_word.h5')
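As a quick sanity check before compiling, the saved file can be reloaded with Keras and re-evaluated (an optional step, not in the original notebook):
In [ ]:
# Optional sanity check: reload the saved model and confirm it evaluates as before
reloaded = tf.keras.models.load_model('visual_wake_word.h5')
reloaded.evaluate(val_ds)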
In [17]:
!deepCC visual_wake_word.h5