Use mel spectrogram images of the audio samples to recognize the spoken digit. The mel scale approximates human pitch perception more closely than a linear frequency scale.
The same approach can be extended to recognize other spoken words.
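As a quick illustration of the scale itself, here is a minimal sketch (an assumption for illustration, using the HTK-style formula mel = 2595 * log10(1 + f/700); librosa defaults to the Slaney formulation, so values differ slightly unless htk=True is passed):
import numpy as np
import librosa

# Convert a few frequencies (Hz) to mels with the HTK formula
freqs_hz = np.array([100, 500, 1000, 4000])
mels_manual = 2595 * np.log10(1 + freqs_hz / 700)

# librosa provides the same conversion; htk=True matches the formula above
mels_librosa = librosa.hz_to_mel(freqs_hz, htk=True)

print(mels_manual)   # equal spacing in mels corresponds to wider Hz spacing at higher frequencies
print(mels_librosa)  # matches the manual computation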
import torch
from torch.utils.data import Dataset, random_split, DataLoader, TensorDataset
import torchvision
from torchvision.datasets.utils import download_url
import torch.nn as nn
import torch.nn.functional as F
import os
import librosa
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import librosa.display
import sklearn
import matplotlib
import csv
from PIL import Image
from sklearn.metrics import f1_score
import IPython.display as ipd
import random
The cell below downloads the zip file containing the dataset and the other files required in this notebook.
!wget -N https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/spoken_digit_melimages.zip
!unzip -qo spoken_digit_melimages.zip
Data¶
digit = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
The audio dataset is a subset of the TensorFlow Speech Commands dataset, restricted to the ten digits; the full audio subset is too large to be included here.
Each sample is a 1-second mono recording sampled at 8000 Hz.
Here is one sample of each digit.
# Random sample selection
digit_audio_sample = ['dataset_sample_0.wav', 'dataset_sample_1.wav', 'dataset_sample_2.wav', 'dataset_sample_3.wav', 'dataset_sample_4.wav', 'dataset_sample_5.wav', 'dataset_sample_6.wav', 'dataset_sample_7.wav', 'dataset_sample_8.wav', 'dataset_sample_9.wav']
i = random.randint(0, 9)
# Selecting the sample by label
#i = 4 # Label = 4
data, sr = librosa.load('melimages/'+digit_audio_sample[i])
fig = plt.figure()
# Two subplots
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
print("Label: ", i, '\n')
print("Sampling rate: ", sr, "\n")
# mel spectrogram
S = librosa.feature.melspectrogram(y=data, sr=sr)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel', ax = ax1)
# waveform (note: librosa.display.waveplot was replaced by waveshow in librosa >= 0.10)
librosa.display.waveplot(np.array(data), sr=sr, ax = ax2)
ipd.Audio(data = data, rate = sr)
Extracting data¶
There is no need to execute the commented code below; the extracted files are already included in the downloaded dataset.
Execution of the data extraction cells below requires the tensorflow speech commands dataset to be downloaded (~1GB).
Link to download the audio dataset : http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
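If you do want to rerun the extraction, here is a minimal download-and-extract sketch (an assumption for illustration: it unpacks the archive into a local data/ folder, the location the commented code below refers to as /content/data/ on Colab):
!wget -N http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
!mkdir -p data
!tar -xzf speech_commands_v0.01.tar.gz -C data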
Creating the reference csv file.¶
'''
# Creating a reference csv file
with open("Spoken_digit.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["File", "Label"])
    for x in digit:
        if os.path.isdir('/content/data/'+x):
            for name in os.listdir('/content/data/'+x):
                if os.path.isfile('/content/data/'+x+"/"+name):
                    csvwriter.writerow([x+'/'+name, x])

# shuffle
df = pd.read_csv('Spoken_digit.csv')
df = df.sample(frac=1)
df.to_csv('Spoken_digit.csv', index = False)
'''
Extracting the mel spectrogram images from the audio samples.¶
'''
# -- Extraction can take about 8 hours
# Making folders to store the extracted images
path = ''
os.makedirs(path+'/melimages')
for i in digit:
    os.makedirs(path+'/melimages/'+i)
    os.makedirs(path+'/melimages100/'+i)
    os.makedirs(path+'/melimages200/'+i)

sp = pd.read_csv('Spoken_digit.csv')  # reference csv created above (assumed; not defined in the original cell)

for i in range(0, len(sp)):
    print(i) # Print to keep track of which file is being processed
    f = sp.loc[i]
    name = f.File
    data, sr = librosa.load('/content/data/'+name)
    fig = plt.figure(figsize=[1,1])
    ax = fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    S = librosa.feature.melspectrogram(y=data, sr=sr)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel')
    file = path+'/melimages/'+str(name[:-4]) + '.jpg'
    plt.savefig(file, bbox_inches='tight', pad_inches=0)
    plt.close()
'''
Note¶
The extracted mel images and the csv file are directly available as a dataset at the download link at the start of the notebook.
Folder - melimages
The dataset folder contains:
* 10 folders, one for each digit. These folders contain the mel spectrogram images of the corresponding audio samples.
* 10 audio samples, one for each digit, as shown above. These help the notebook user get a feel for the data, since the original audio files are not included here.
* 4 user-recorded samples that are not part of the dataset (neither train nor validation).
Spoken_digit.csv : a CSV file with two columns
* File - path of the file within the dataset
* Label - label of the sample
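As a quick sanity check that the zip extracted as described, a minimal listing sketch (assuming melimages/ sits in the current working directory):
# Top-level contents: ten digit folders plus the standalone .wav samples
print(sorted(os.listdir('melimages')))

# Number of spectrogram images per digit folder
for d in digit:
    print(d, len(os.listdir('melimages/' + d)))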
# Spoken_digit.csv
spokendigit = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Spoken_digit.csv')
spokendigit
spokendigit['Label'].value_counts()
This is a balanced dataset with ~2360 samples per digit.
Mel dataset¶
class SpokenDigit(Dataset):
    def __init__(self, file = None, rootdir = None, transform = None):
        self.df = pd.read_csv(file)
        self.rootdir = rootdir # root directory for the images
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.loc[i]
        fname, label = row['File'], row['Label']
        ik = self.rootdir+fname[:-4]+'.jpg' # swap the .wav extension for .jpg
        img = Image.open(ik)
        if self.transform:
            img = self.transform(img)
        return img, torch.tensor(digit.index(label)) # return image tensor and numeric label

    def getsr(self, i):
        # sampling rate of the original audio (requires the audio file to be present under rootdir)
        row = self.df.loc[i]
        fname, label = row['File'], row['Label']
        _, sr = librosa.load(self.rootdir+fname)
        return sr
meldset = SpokenDigit('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Spoken_digit.csv', 'melimages/', transforms.Compose([transforms.ToTensor()]))
# train - val split of 90-10
size = len(meldset)
val_size = int(0.1 * size)
train_size = size - val_size
train_dset, val_dset = random_split(meldset, [train_size, val_size])
print("Number of samples in train set: ", train_size)
print("Number of samples in validation set: ", val_size)
train_dl = DataLoader(train_dset, 64, True, num_workers=6, pin_memory=True)  # batch size 64, shuffle=True
val_dl = DataLoader(val_dset, 64, num_workers=6, pin_memory=True)            # batch size 64, no shuffling
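A quick sketch to confirm what a batch looks like before building the model (the spatial size depends on the saved figure dimensions; the network below pools down to 1x1 adaptively, so it does not depend on an exact image size):
# Inspect one training batch: image tensor shape and the first few labels
images, labels = next(iter(train_dl))
print(images.shape)   # e.g. torch.Size([64, 3, H, W])
print(labels[:10])    # numeric digit labels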
Model¶
class SpokenDigitModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10),
            # Note: F.cross_entropy below already applies log-softmax internally, so this Softmax
            # is redundant for training; it is kept so the forward pass outputs probabilities.
            nn.Softmax(dim = 1)
        )

    def forward(self, x):
        return self.network(x)

    def training_step(self, batch):
        inputs, labels = batch
        outputs = self(inputs)
        loss = F.cross_entropy(outputs, labels) # cross entropy loss
        return loss

    def validation_step(self, batch):
        inputs, labels = batch
        outputs = self(inputs)
        loss = F.cross_entropy(outputs, labels)
        _, pred = torch.max(outputs, 1)
        accuracy = torch.tensor(torch.sum(pred==labels).item()/len(pred)) # calculate accuracy
        return [loss.detach(), accuracy.detach()]
def evaluate(model, loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in loader]
    outputs = torch.tensor(outputs).T
    loss, accuracy = torch.mean(outputs, dim=1)
    return {"loss" : loss.item(), "accuracy" : accuracy.item()}

def fit(model, train_loader, val_loader, epochs, lr, optimizer_function = torch.optim.Adam):
    history = []
    optimizer = optimizer_function(model.parameters(), lr)
    for epoch in range(epochs):
        print("Epoch ", epoch)
        # Train
        model.train()
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validate
        result = evaluate(model, val_loader)
        print("  Val_loss: ", result['loss'], "  Accuracy: ", result['accuracy'])
        history.append(result)
    return history
Training the model¶
model = SpokenDigitModel()
history = []
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
# evaluating on an untrained model
evaluate(model, val_dl)
from torchsummary import summary
import torch
import torchvision
from torch import nn
from torchvision import models
print(model)
count_parameters(model)
history.append(fit(model, train_dl, val_dl, 16, 0.001))
history.append(fit(model, train_dl, val_dl, 16, 0.0001))
evaluate(model, val_dl)
torch.save(model, 'spokendigit_cnn_mel.pth')
model = torch.load('spokendigit_cnn_mel.pth')
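torch.save(model, ...) pickles the whole module, which ties the checkpoint to this exact class definition. A minimal sketch of the more portable state_dict alternative (the filename here is illustrative, not part of the original notebook):
# Save only the learned parameters
torch.save(model.state_dict(), 'spokendigit_cnn_mel_state.pth')

# Reload: rebuild the architecture, then load the weights
model = SpokenDigitModel()
model.load_state_dict(torch.load('spokendigit_cnn_mel_state.pth'))
model.eval()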
losses = []
accuracies = []
# gathering metrics across all epochs
for i in range(len(history)):
    for j in history[i]:
        losses.append(j['loss'])
        accuracies.append(j['accuracy'])
# function to plot metrics
def plot(var, title):
    plt.plot(var, '-x')
    plt.title(title)
plot(losses, 'Losses')
plot(accuracies, 'Accuracy')
Testing against data outside the dataset¶
(My recorded voice)
# function to get mel spectrogram image
def get_mel(data, sr):
    fig = plt.figure(figsize=[1,1])
    ax = fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    # mel spectrogram
    S = librosa.feature.melspectrogram(y=data, sr=sr)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel', fmin=50, fmax=280)
    file = 'sample.jpg'
    plt.savefig(file, bbox_inches='tight', pad_inches=0)
    plt.close()
    img = Image.open(file)
    transform = transforms.Compose([transforms.ToTensor()])
    img = transform(np.asarray(img))
    # delete the temporary file, not needed anymore
    os.remove('sample.jpg')
    return img
# function to get output from audio sample
def get_prediction(model, img):
    # adding dimension corresponding to batch (3, 54, 55) --> (1, 3, 54, 55)
    output = model(img.unsqueeze(0)).detach().numpy()
    num = np.argmax(output)
    return output, num
# List of user recorded samples
user_samples = ['user_sample_1.wav', 'user_sample_2.wav', 'user_sample_3.wav', 'user_sample_5.wav']
sample_id = random.randint(0,3) # select random sample
data, sr = librosa.load('melimages/'+user_samples[sample_id])
output, prediction = get_prediction(model, get_mel(data, sr))
print("Predicted {} with probability {}.".format(prediction, output[0][prediction]))
ipd.Audio(data = data, rate = sr)
deepC¶
meldset[0][0].shape
dummy = meldset[0][0].unsqueeze(0)
torch.onnx.export(model, dummy, 'spokendigit_cnn_mel.onnx')
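Before compiling with deepCC, it can be worth checking that the exported ONNX graph matches the PyTorch model. A minimal sketch, assuming onnxruntime is available in the environment:
import onnxruntime as ort

sess = ort.InferenceSession('spokendigit_cnn_mel.onnx')
input_name = sess.get_inputs()[0].name

# Run the same dummy input through both models and compare the outputs
onnx_out = sess.run(None, {input_name: dummy.numpy()})[0]
torch_out = model(dummy).detach().numpy()
print(np.abs(onnx_out - torch_out).max())  # should be close to zero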
!deepCC spokendigit_cnn_mel.onnx
# List of user recorded samples
user_samples = ['user_sample_1.wav', 'user_sample_2.wav', 'user_sample_3.wav', 'user_sample_5.wav']
sample_id = random.randint(0,3) # select random sample
# write to input file img.data
data, sr = librosa.load('melimages/'+user_samples[sample_id])
img_data = get_mel(data, sr).unsqueeze(0)
np.savetxt('img.data', img_data.flatten())
# pass to .exe file
!spokendigit_cnn_mel_deepC/spokendigit_cnn_mel.exe img.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
print ("Model predicted the digit ", np.argmax(nn_out))
ipd.Audio(data = data, rate = sr)