Use mel spectrogram images of the audio samples to recognize the spoken digit. The mel scale approximates human pitch perception more closely than a linear frequency scale.
The same approach can be extended to recognize other spoken words.
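As a quick illustration of the scale itself, here is a minimal sketch (an assumption for illustration, using the HTK-style formula mel = 2595 * log10(1 + f/700); librosa defaults to the Slaney formulation, so values differ slightly unless htk=True is passed):
import numpy as np
import librosa

# Convert a few frequencies (Hz) to mels with the HTK formula
freqs_hz = np.array([100, 500, 1000, 4000])
mels_manual = 2595 * np.log10(1 + freqs_hz / 700)

# librosa provides the same conversion; htk=True matches the formula above
mels_librosa = librosa.hz_to_mel(freqs_hz, htk=True)

print(mels_manual)   # equal spacing in mels corresponds to wider Hz spacing at higher frequencies
print(mels_librosa)  # matches the manual computation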
import torch
from torch.utils.data import Dataset, random_split, DataLoader, TensorDataset
import torchvision
from torchvision.datasets.utils import download_url
import torch.nn as nn
import torch.nn.functional as F
import os
import librosa
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import librosa.display
import sklearn
import matplotlib
import csv
from PIL import Image
from sklearn.metrics import f1_score
import IPython.display as ipd
import random
The cell below downloads the zip file containing the dataset and the other files required in this notebook.
!wget -N https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/spoken_digit_melimages.zip
!unzip -qo spoken_digit_melimages.zip
Data¶
digit = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
The audio dataset is a subset of the TensorFlow Speech Commands dataset, restricted to the ten digits; the full audio subset is too large to be included here.
Each sample is a 1-second mono recording sampled at 8000 Hz.
Here is one sample of each digit.
# Random sample selection
digit_audio_sample = ['dataset_sample_0.wav', 'dataset_sample_1.wav', 'dataset_sample_2.wav', 'dataset_sample_3.wav', 'dataset_sample_4.wav', 'dataset_sample_5.wav', 'dataset_sample_6.wav', 'dataset_sample_7.wav', 'dataset_sample_8.wav', 'dataset_sample_9.wav']
i = random.randint(0, 9)
# Selecting the sample by label
#i = 4 # Label = 4
data, sr = librosa.load('melimages/'+digit_audio_sample[i])
fig = plt.figure()
# Two subplots
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
print("Label: ", i, '\n')
print("Sampling rate: ", sr, "\n")
# mel spectrogram
S = librosa.feature.melspectrogram(y=data, sr=sr)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel', ax = ax1)
# waveform (note: librosa.display.waveplot was replaced by waveshow in librosa >= 0.10)
librosa.display.waveplot(np.array(data), sr=sr, ax = ax2)
ipd.Audio(data = data, rate = sr)
Extracting data¶
There is no need to execute the commented code below; the extracted files are already included in the downloaded dataset.
Execution of the data extraction cells below requires the tensorflow speech commands dataset to be downloaded (~1GB).
Link to download the audio dataset : http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
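If you do want to rerun the extraction, here is a minimal download-and-extract sketch (an assumption for illustration: it unpacks the archive into a local data/ folder, the location the commented code below refers to as /content/data/ on Colab):
!wget -N http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
!mkdir -p data
!tar -xzf speech_commands_v0.01.tar.gz -C data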
Creating the reference csv file.¶
'''
# Creating a reference csv file
with open("Spoken_digit.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["File", "Label"])
    for x in digit:
        if os.path.isdir('/content/data/'+x):
            for name in os.listdir('/content/data/'+x):
                if os.path.isfile('/content/data/'+x+"/"+name):
                    csvwriter.writerow([x+'/'+name, x])

# shuffle
df = pd.read_csv('Spoken_digit.csv')
df = df.sample(frac=1)
df.to_csv('Spoken_digit.csv', index = False)
'''
Extracting the mel spectrogram images from the audio samples.¶
'''
# -- Extraction can take about 8 hours
# Making folders to store the extracted images
path = ''
os.makedirs(path+'/melimages')
for i in digit:
    os.makedirs(path+'/melimages/'+i)
    os.makedirs(path+'/melimages100/'+i)
    os.makedirs(path+'/melimages200/'+i)

sp = pd.read_csv('Spoken_digit.csv')  # reference csv created above (assumed; not defined in the original cell)

for i in range(0, len(sp)):
    print(i) # Print to keep track of which file is being processed
    f = sp.loc[i]
    name = f.File
    data, sr = librosa.load('/content/data/'+name)
    fig = plt.figure(figsize=[1,1])
    ax = fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    S = librosa.feature.melspectrogram(y=data, sr=sr)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel')
    file = path+'/melimages/'+str(name[:-4]) + '.jpg'
    plt.savefig(file, bbox_inches='tight', pad_inches=0)
    plt.close()
'''
Note¶
The extracted mel images and the csv file are directly available as a dataset at the download link at the start of the notebook.
Folder - melimages
The dataset folder contains:
* 10 folders, one for each digit. These folders contain the mel spectrogram images of the corresponding audio samples.
* 10 audio samples, one for each digit, as shown above. These help the notebook user get a feel for the data, since the original audio files are not included here.
* 4 user-recorded samples that are not part of the dataset (neither train nor validation).
Spoken_digit.csv : a CSV file with two columns
* File - path of the file within the dataset
* Label - label of the sample
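As a quick sanity check that the zip extracted as described, a minimal listing sketch (assuming melimages/ sits in the current working directory):
# Top-level contents: ten digit folders plus the standalone .wav samples
print(sorted(os.listdir('melimages')))

# Number of spectrogram images per digit folder
for d in digit:
    print(d, len(os.listdir('melimages/' + d)))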
# Spoken_digit.csv
spokendigit = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Spoken_digit.csv')
spokendigit
spokendigit['Label'].value_counts()
This is a balanced dataset with ~2360 samples per digit.
Mel dataset¶
class SpokenDigit(Dataset):
    def __init__(self, file = None, rootdir = None, transform = None):
        self.df = pd.read_csv(file)
        self.rootdir = rootdir # root directory for the images
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.loc[i]
        fname, label = row['File'], row['Label']
        ik = self.rootdir+fname[:-4]+'.jpg' # swap the .wav extension for .jpg
        img = Image.open(ik)
        if self.transform:
            img = self.transform(img)
        return img, torch.tensor(digit.index(label)) # return image tensor and numeric label

    def getsr(self, i):
        # sampling rate of the original audio (requires the audio file to be present under rootdir)
        row = self.df.loc[i]
        fname, label = row['File'], row['Label']
        _, sr = librosa.load(self.rootdir+fname)
        return sr
meldset = SpokenDigit('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Spoken_digit.csv', 'melimages/', transforms.Compose([transforms.ToTensor()]))
# train - val split of 90-10
size = len(meldset)
val_size = int(0.1 * size)
train_size = size - val_size
train_dset, val_dset = random_split(meldset, [train_size, val_size])
print("Number of samples in train set: ", train_size)
print("Number of samples in validation set: ", val_size)
train_dl = DataLoader(train_dset, 64, True, num_workers=6, pin_memory=True)  # batch size 64, shuffle=True
val_dl = DataLoader(val_dset, 64, num_workers=6, pin_memory=True)            # batch size 64, no shuffling
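A quick sketch to confirm what a batch looks like before building the model (the spatial size depends on the saved figure dimensions; the network below pools down to 1x1 adaptively, so it does not depend on an exact image size):
# Inspect one training batch: image tensor shape and the first few labels
images, labels = next(iter(train_dl))
print(images.shape)   # e.g. torch.Size([64, 3, H, W])
print(labels[:10])    # numeric digit labels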
Model¶
class SpokenDigitModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10),
            # Note: F.cross_entropy below already applies log-softmax internally, so this Softmax
            # is redundant for training; it is kept so the forward pass outputs probabilities.
            nn.Softmax(dim = 1)
        )

    def forward(self, x):
        return self.network(x)

    def training_step(self, batch):
        inputs, labels = batch
        outputs = self(inputs)
        loss = F.cross_entropy(outputs, labels) # cross entropy loss
        return loss

    def validation_step(self, batch):
        inputs, labels = batch
        outputs = self(inputs)
        loss = F.cross_entropy(outputs, labels)
        _, pred = torch.max(outputs, 1)
        accuracy = torch.tensor(torch.sum(pred==labels).item()/len(pred)) # calculate accuracy
        return [loss.detach(), accuracy.detach()]
def evaluate(model, loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in loader]
    outputs = torch.tensor(outputs).T
    loss, accuracy = torch.mean(outputs, dim=1)
    return {"loss" : loss.item(), "accuracy" : accuracy.item()}

def fit(model, train_loader, val_loader, epochs, lr, optimizer_function = torch.optim.Adam):
    history = []
    optimizer = optimizer_function(model.parameters(), lr)
    for epoch in range(epochs):
        print("Epoch ", epoch)
        # Train
        model.train()
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validate
        result = evaluate(model, val_loader)
        print("  Val_loss: ", result['loss'], "  Accuracy: ", result['accuracy'])
        history.append(result)
    return history
Training the model¶
model = SpokenDigitModel()
history = []
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
# evaluating on an untrained model
evaluate(model, val_dl)
from torchsummary import summary
import torch
import torchvision
from torch import nn
from torchvision import models
print(model)
count_parameters(model)
history.append(fit(model, train_dl, val_dl, 16, 0.001))
history.append(fit(model, train_dl, val_dl, 16, 0.0001))
evaluate(model, val_dl)
torch.save(model, 'spokendigit_cnn_mel.pth')
model = torch.load('spokendigit_cnn_mel.pth')
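torch.save(model, ...) pickles the whole module, which ties the checkpoint to this exact class definition. A minimal sketch of the more portable state_dict alternative (the filename here is illustrative, not part of the original notebook):
# Save only the learned parameters
torch.save(model.state_dict(), 'spokendigit_cnn_mel_state.pth')

# Reload: rebuild the architecture, then load the weights
model = SpokenDigitModel()
model.load_state_dict(torch.load('spokendigit_cnn_mel_state.pth'))
model.eval()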
losses = []
accuracies = []
# gathering metrics across all epochs
for i in range(len(history)):
    for j in history[i]:
        losses.append(j['loss'])
        accuracies.append(j['accuracy'])
# function to plot metrics
def plot(var, title):
    plt.plot(var, '-x')
    plt.title(title)
plot(losses, 'Losses')
plot(accuracies, 'Accuracy')
Testing against data outside the dataset¶
(My recorded voice)
# function to get mel spectrogram image
def get_mel(data, sr):
    fig = plt.figure(figsize=[1,1])
    ax = fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    # mel spectrogram
    S = librosa.feature.melspectrogram(y=data, sr=sr)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel', fmin=50, fmax=280)
    file = 'sample.jpg'
    plt.savefig(file, bbox_inches='tight', pad_inches=0)
    plt.close()
    img = Image.open(file)
    transform = transforms.Compose([transforms.ToTensor()])
    img = transform(np.asarray(img))
    # delete the temporary file, not needed anymore
    os.remove('sample.jpg')
    return img
# function to get output from audio sample
def get_prediction(model, img):
    # adding dimension corresponding to batch (3, 54, 55) --> (1, 3, 54, 55)
    output = model(img.unsqueeze(0)).detach().numpy()
    num = np.argmax(output)
    return output, num
# List of user recorded samples
user_samples = ['user_sample_1.wav', 'user_sample_2.wav', 'user_sample_3.wav', 'user_sample_5.wav']
sample_id = random.randint(0,3) # select random sample
data, sr = librosa.load('melimages/'+user_samples[sample_id])
output, prediction = get_prediction(model, get_mel(data, sr))
print("Predicted {} with probability {}.".format(prediction, output[0][prediction]))
ipd.Audio(data = data, rate = sr)
deepC¶
meldset[0][0].shape
dummy = meldset[0][0].unsqueeze(0)
torch.onnx.export(model, dummy, 'spokendigit_cnn_mel.onnx')
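Before compiling with deepCC, it can be worth checking that the exported ONNX graph matches the PyTorch model. A minimal sketch, assuming onnxruntime is available in the environment:
import onnxruntime as ort

sess = ort.InferenceSession('spokendigit_cnn_mel.onnx')
input_name = sess.get_inputs()[0].name

# Run the same dummy input through both models and compare the outputs
onnx_out = sess.run(None, {input_name: dummy.numpy()})[0]
torch_out = model(dummy).detach().numpy()
print(np.abs(onnx_out - torch_out).max())  # should be close to zero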
!deepCC spokendigit_cnn_mel.onnx
# List of user recorded samples
user_samples = ['user_sample_1.wav', 'user_sample_2.wav', 'user_sample_3.wav', 'user_sample_5.wav']
sample_id = random.randint(0,3) # select random sample
# write to input file img.data
data, sr = librosa.load('melimages/'+user_samples[sample_id])
img_data = get_mel(data, sr).unsqueeze(0)
np.savetxt('img.data', img_data.flatten())
# pass to .exe file
!spokendigit_cnn_mel_deepC/spokendigit_cnn_mel.exe img.data
# show predicted output
nn_out = np.loadtxt('deepSea_result_1.out')
print ("Model predicted the digit ", np.argmax(nn_out))
ipd.Audio(data = data, rate = sr)