Everyday sound classification¶
Credit: AITS Cainvas Community
Danger identification¶
Photo by Ekrem EDALI on Dribbble
Identify sounds that indicate possible danger in the surroundings.
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
import os
import IPython.display as ipd
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
The dataset¶
On Kaggle by Chris Gorgolewski
The UrbanSound8K dataset is commonly used for academic research. It contains 8732 labeled sound excerpts of urban sounds from 10 classes:
air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, and street_music.
The classes are drawn from the urban sound taxonomy. Each file is at most 4 s long. The files are pre-sorted into ten folds (folders named fold1-fold10) to make it easy to reproduce and compare against results reported in academic papers. In addition to the sound excerpts, a CSV file containing metadata about each excerpt is also provided.
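Each excerpt lives under fold&lt;N&gt;/ with its file name in the slice_file_name column of the metadata CSV; a minimal sketch (runnable once the archive has been extracted to audio/ as in the cells below) of resolving one file from the metadata:
# Sketch: resolve the on-disk path of the first excerpt from the metadata CSV.
# Assumes the archive has been extracted to "audio/" as done below.
meta = pd.read_csv('audio/UrbanSound8K.csv')
row = meta.iloc[0]
example_path = "audio/fold{}/{}".format(row['fold'], row['slice_file_name'])
print(row['class'], example_path)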
!wget -O archive.zip "https://storage.googleapis.com/kaggle-data-sets/500970/928025/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210307%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210307T062240Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=56582821d50b586320ec4577bddf184342c308bed951e3611c9aa67e83261cbced21ed4c3cfc796682080b9690d9d9031570458a605730cf635b93bde57666da8a42bf48a1607f8c43a437ee8ce9af05e95005ff2659f6c3279f77f079d697fafed30fc6a0f2702550e2fc8cc3305d896f6fc8726003fd376999b10b62752e3e0f30981c814652384a708ae9a03fab3493edc0ba77fdf21b3ee2300d28ff45b676a8d688798f14e903f8beaa55ba7247bfa00cd806441508fc8eac9a011affcb816dc03a963b0d891efa16cfc711e34f341775a67e673ea1fe907fe401d4d496fe794f61509666d8bd5361d6aebf15b464b2db48c696e9f0b872eef4e86620ed"
!unzip -q archive.zip -d audio
!rm archive.zip
df = pd.read_csv('audio/UrbanSound8K.csv')
df
df['class'].value_counts()
max_length = 4 #length (in seconds) of input
desired_sr = 16000 #sampling rate to use
# NOTE: downsampling to 4000 Hz results in a significant accuracy drop (to ~60%)
mic_sr = 16000 # sampling rate supported by the microphone interface (e.g., PDM)
desired_samples = max_length*desired_sr #total number of samples in input
tf.random.set_seed(0)
np.random.seed(0)
os.environ['PYTHONHASHSEED'] = '0'
Utility functions¶
#Processing the data
dataset_min = 0.0
dataset_max = 1.0
#Function to denormalize output values
def denormalize_dataset(input_val):
    global dataset_min, dataset_max
    return input_val * (dataset_max - dataset_min)

#Function to normalize input values
def normalize_dataset(input_val):
    global dataset_min, dataset_max
    dataset_min = np.min(input_val)
    dataset_max = np.max(input_val)
    diff = dataset_max - dataset_min
    if (diff != 0):
        input_val /= diff
    return input_val
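A quick round-trip check of the two helpers (illustrative only); note that normalization just rescales by the range and does not subtract the minimum:
# Toy example: normalize then denormalize a small array.
toy = np.array([0.0, 2.0, 4.0], dtype=np.float32)
scaled = normalize_dataset(toy)        # divides by (max - min) = 4 -> [0., 0.5, 1.]
restored = denormalize_dataset(scaled) # multiplies back by the stored range
print(scaled, restored)
# (this also updates the global dataset_min/dataset_max, which the real normalization below resets)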
def interpolateAudio(audio):
    factor = float(mic_sr)/desired_sr
    x_interp_values = []
    for i in range(len(audio)):
        x_interp_values.append(int(factor*i))
    audio_interpolated = np.interp(range(int(len(audio)*factor)), x_interp_values, audio)
    return mic_sr, audio_interpolated
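With mic_sr equal to desired_sr the interpolation factor is 1.0 and the function is effectively a pass-through; a small illustrative call:
# Illustration only: interpolate a short dummy signal to the microphone rate.
_, dummy_out = interpolateAudio(np.zeros(8, dtype=np.float32))
print(dummy_out.shape)  # same length as the input when mic_sr == desired_sr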
word_dirs = list(set(df['class'].to_list()))
hotwords = ['jackhammer', 'dog_bark', 'siren', 'gun_shot']
print("All words in dataset - \n", ', '.join(word_dirs))
print("\nHotwords - \n", ', '.join(hotwords))
Handling labels and background samples¶
add_noise = False # add different words, null samples and random noise
n_classes = len(hotwords) + int(add_noise)
class_nSamples = 1000
other_nSamples = float(class_nSamples)/(len(word_dirs) - n_classes)
def nLabel(word):
    return n_classes-1 if ( word not in hotwords ) else hotwords.index(word)

def textLabel(index):
    return hotwords[index] if index < len(hotwords) else "background"

def sampleBackGround():
    return add_noise
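A quick check of the label mapping: hotwords map to their index in the list, while every other word (only included when add_noise is True) falls into the last class slot:
# Illustrative label lookups.
print(nLabel('dog_bark'), textLabel(nLabel('dog_bark')))  # 1 dog_bark
print(nLabel('gun_shot'), textLabel(nLabel('gun_shot')))  # 3 gun_shot
print(nLabel('street_music'))  # n_classes-1; a separate "background" class only exists when add_noise is True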
Creating the dataset¶
#Dataset storing audio samples for wake word and background
top_dir = "audio"
input_audio = np.empty((0, desired_samples)).astype(np.float32)
input_labels = np.empty((0)).astype(np.int32) # index of the word in the hotwords list is the label
for word in word_dirs:
    print("\n", word)
    if (word not in hotwords) and (not sampleBackGround()): # background, do not include
        print("-- Background/noise/other words not included")
        continue
    else: # to be included
        dfx = df[df['class'] == word]
        start_time = time.time()
        wav_files = 0
        word_samples = np.empty((0, desired_samples))
        if word in hotwords: # hotwords
            print("-- Category : hotword")
            for i in range(len(dfx)):
                file_path = top_dir + "/fold" + str(dfx.iloc[i]['fold']) + "/" + str(dfx.iloc[i]['slice_file_name'])
                X_sub = np.empty((0, desired_samples))
                X, sr = librosa.core.load(file_path, sr=desired_sr)
                X, interval = librosa.effects.trim(X)
                if X.shape[0] < desired_sr: # skip clips shorter than 1 second
                    continue
                if X.shape[0] % desired_samples != 0: # pad only when needed, otherwise unnecessary silence gets appended
                    X = np.pad(X, (0, desired_samples - (X.shape[0] % desired_samples)))
                X_sub = np.array(np.split(X, int(X.shape[0]*1.0/desired_samples)))
                word_samples = np.append(word_samples, X_sub, axis=0)
                if (word_samples.shape[0] > class_nSamples):
                    break
                wav_files = wav_files + 1
        else:
            print("-- Category : background/noise/other words")
            for i in range(len(dfx)):
                file_path = top_dir + "/fold" + str(dfx.iloc[i]['fold']) + "/" + str(dfx.iloc[i]['slice_file_name'])
                X, sr = librosa.core.load(file_path, sr=desired_sr)
                X, interval = librosa.effects.trim(X)
                X = np.pad(X, (0, desired_samples - (X.shape[0] % desired_samples)))
                X_sub = np.array(np.split(X, int(X.shape[0]*1.0/desired_samples)))
                word_samples = np.append(word_samples, X_sub, axis=0)
                if (word_samples.shape[0] > other_nSamples):
                    break
                wav_files = wav_files + 1
        if (word_samples.size > 0):
            input_audio = np.concatenate((input_audio, word_samples), axis=0)
            labels = np.full((word_samples.shape[0]), nLabel(word))
            input_labels = np.concatenate((input_labels, labels))
            print("added {} audio files with {} samples for word \"{}\" with label {} in {:.1f} sec.".format(
                wav_files, labels.shape[0], word, nLabel(word), (time.time() - start_time)))
# One-hot encoding the integer labels
onehot_labels = np.zeros((input_labels.size, n_classes)).astype(np.int32)
onehot_labels[np.arange(input_labels.size), input_labels] = 1
input_labels = onehot_labels
print("Input dataset size:", input_audio.shape)
print("Input targets size:", input_labels.shape)
Adding noise/silence as background¶
# Add 10% of random noise and 10% of silent samples as background.
if (sampleBackGround()):
    n_bg_samples = int(other_nSamples)
    bg_labels = np.zeros((n_bg_samples, n_classes)).astype(np.int32)
    bg_labels[:, n_classes-1] = 1

    silence = np.zeros((n_bg_samples, desired_samples))
    input_audio = np.append(input_audio, silence, axis=0)
    input_labels = np.append(input_labels, bg_labels, axis=0)

    background = np.random.random((n_bg_samples, desired_samples)) # random noise, as stated in the comment above
    input_audio = np.append(input_audio, background, axis=0)
    input_labels = np.append(input_labels, bg_labels, axis=0)

    # %xdel background
    # %xdel silence
    # %xdel bg_labels
Extracting STFT features¶
# hop_len=int(win_len/4) # default
# fft_len=pow(2, int(np.log2(win_len)+1))
fft_len = 2048
win_len = fft_len
hop_len = int(win_len/4)
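With these settings a 4-second clip at 16 kHz yields int(fft_len/2 + 1) = 1025 frequency bins and int(desired_samples/hop_len + 1) = 126 time frames, which is the shape pre-allocated below; a quick check:
# Expected spectrogram shape for one clip, derived from the constants above.
freq_bins = fft_len // 2 + 1                  # 1025
time_frames = desired_samples // hop_len + 1  # 126
print((freq_bins, time_frames))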
def spectrogramOp(X):
    # STFT returns np.ndarray of shape=(1 + fft_len/2, t)
    spectrogram_out = librosa.core.stft(X, n_fft=fft_len, hop_length=hop_len, win_length=win_len, center=True)
    # spectrogram_out = np.swapaxes(np.abs(spectrogram_out), 0, 1)
    return np.absolute(spectrogram_out)

#inputs = np.array([spectrogramOp(input) for input in input_audio])
input_spectrogram = np.empty((input_audio.shape[0], int(fft_len/2 + 1), int(desired_samples/hop_len + 1))).astype(np.float32)
i = 0
for input in input_audio:
    input_spectrogram[i] = spectrogramOp(input)
    i = i + 1
print("input dataset size:", input_spectrogram.shape)
Normalize¶
# Normalize
input_spectrogram = normalize_dataset(input_spectrogram)
Train-test split¶
total_len = input_labels.shape[0]
#Shuffling inputs and labels
shuffle_permutation = np.arange(total_len)
np.random.shuffle(shuffle_permutation)
input_spectrogram = input_spectrogram[shuffle_permutation]
input_labels = input_labels[shuffle_permutation]
#Splitting into train and test dataset - 90-10 ratio
train_split = 0.9
cutoff = int(train_split*total_len)
inputs_train = input_spectrogram[:cutoff]
inputs_test = input_spectrogram[cutoff:]
labels_train = input_labels[:cutoff]
labels_test = input_labels[cutoff:]
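train_test_split is imported above but never used; the same 90/10 split could also be obtained with it, stratified on the class index so both sets keep a similar label distribution. A sketch (reusing the same variable names so the rest of the notebook still works):
# Optional alternative to the manual shuffle-and-slice split above.
inputs_train, inputs_test, labels_train, labels_test = train_test_split(
    input_spectrogram, input_labels, test_size=0.1,
    stratify=input_labels.argmax(axis=1), random_state=0)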
Play using inverse-STFT¶
#Selecting random index from train dataset
ind = int(np.random.uniform()*len(inputs_train))
#Displaying sample spectrogram and audio from train dataset
X = inputs_train[ind]
y = labels_train[ind].argmax()
print("Label :", textLabel(y) )
plt.imshow(X, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()
audio = librosa.core.istft(X, hop_length=hop_len, win_length=win_len)
ipd.Audio(audio, rate=desired_sr, autoplay=True)
Model¶
model = tf.keras.Sequential()
# inputs_train.shape[1:] = (int(fft_len/2 + 1), int(desired_samples/hop_len + 1)) = (1025, 126) with the settings above.
# int(fft_len/2 + 1) = 1025 = frequency bins (rows), int(desired_samples/hop_len + 1) = 126 = time frames (columns).
lambda1 = tf.keras.layers.Lambda(lambda x: tf.reshape(x, (-1, int(fft_len/2 + 1), int(desired_samples/hop_len + 1), 1)),
                                 name="add_channels", input_shape=(None, int(fft_len/2 + 1), int(desired_samples/hop_len + 1)))
conv2d1 = tf.keras.layers.Conv2D(16, (int(fft_len/2 + 1), 4), strides=1, activation='relu', name="conv1",
                                 input_shape=(int(fft_len/2 + 1), int(desired_samples/hop_len + 1), 1))
conv2d2 = tf.keras.layers.Conv2D(32, (1, 4), strides=4, activation='relu', name="conv2")
conv2d3 = tf.keras.layers.Conv2D(64, (1, 4), strides=3, activation='relu', name="conv3")
flatten1 = tf.keras.layers.Flatten()
dense1 = tf.keras.layers.Dense(6*n_classes)
dense2 = tf.keras.layers.Dense(n_classes)
dropout = tf.keras.layers.Dropout(0.2)
activation1 = tf.keras.layers.Activation('softmax')
model.add(lambda1)
model.add(conv2d1)
model.add(conv2d2)
model.add(conv2d3)
model.add(flatten1)
model.add(dense1)
model.add(dense2)
model.add(dropout)
model.add(activation1)
model.compile(optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'],
loss=(tf.keras.losses.binary_crossentropy if (n_classes==2) else tf.keras.losses.categorical_crossentropy))
model.summary()
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights = True)]
model.fit(inputs_train, labels_train, batch_size=64, epochs=1024, callbacks=callbacks, validation_data=(inputs_test, labels_test))
model.evaluate(inputs_test, labels_test)
ind = int(np.random.uniform()*len(inputs_test))
spectrogram_out = inputs_test[ind]
# ipd.Audio(spectrogram_out, rate=desired_sr)  # disabled: this is a magnitude spectrogram, not audio; playback via the inverse STFT is below
y = labels_test[ind]
output = model.predict(np.expand_dims(np.array([inputs_test[ind]]), 0))
print("True label:", textLabel(np.argmax(y)))
print("Prediction:", textLabel(np.argmax(output)))
plt.imshow(spectrogram_out, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()
audio = librosa.core.istft(spectrogram_out, hop_length=hop_len, win_length=win_len)
playback_sr, audio_interpolated = interpolateAudio(audio)
ipd.Audio(audio_interpolated, rate=playback_sr, autoplay=True)
Confusion matrix¶
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
predictions_test = model.predict(inputs_test)
cm = confusion_matrix(labels_test.argmax(axis=1), predictions_test.argmax(axis=1))
tck_labels = hotwords+["noise"] if ( sampleBackGround() ) else hotwords
sns.heatmap(cm, xticklabels=tck_labels, yticklabels=tck_labels, annot=True,fmt="d",)
plt.title("Validation Confusion Matrix")
plt.ylabel("A C T U A L")
plt.xlabel("P R E D I C T E D")
plt.show()
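For per-class precision and recall, scikit-learn's classification_report can complement the confusion matrix (an optional addition, reusing the predictions computed above):
# Optional: per-class precision/recall/F1 on the test split.
from sklearn.metrics import classification_report
print(classification_report(labels_test.argmax(axis=1),
                            predictions_test.argmax(axis=1),
                            target_names=tck_labels))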
On background and silence¶
background = spectrogramOp(np.random.random((desired_samples)))
silence = spectrogramOp(np.zeros((desired_samples)))
background_out, silence_out = model.predict(np.array([background, silence]))
print("Predicted ", textLabel(background_out.argmax()), "on random audio with vector", background_out)
print("Predicted ", textLabel(silence_out.argmax()), "on null audio with vector", silence_out)
On longer clips¶
n = 4 # number of samples
X, y = [], []
top_dir = "audio"
for i in range(n):
    i = int(np.random.uniform()*len(df))
    file_path = top_dir + "/fold" + str(df.iloc[i]['fold']) + "/" + str(df.iloc[i]['slice_file_name'])
    audio, sr = librosa.core.load(file_path, sr=desired_sr, mono=True)
    audio, interval = librosa.effects.trim(audio, top_db=30)
    audio = np.pad(audio, int((desired_samples-audio.shape[0])/2))
    X = np.append(X, audio)
    y.append(df.iloc[i]['class'])
X = np.pad(X, (0, desired_samples - (X.shape[0]%desired_samples)))
assert sr == desired_sr #ensure sample rate is same as desired
assert len(X.shape) == 1 #ensure X is a mono signal
spectrogram_out = spectrogramOp(X)
plt.imshow(spectrogram_out, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()
print("The true labels are:", ', '.join(y))
ipd.Audio(X, rate=sr, autoplay=True)
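The concatenated clip is several times longer than one model input; one way (a sketch, not part of the original notebook) to run the classifier over it is to slice the long spectrogram into model-sized windows along the time axis and predict each window separately:
# Sketch: classify the long clip window by window.
# Each window spans roughly one desired_samples-long chunk of audio, i.e. time_frames spectrogram columns.
time_frames = desired_samples // hop_len + 1
for w in range(spectrogram_out.shape[1] // time_frames):
    window = spectrogram_out[:, w*time_frames:(w+1)*time_frames]
    pred = model.predict(np.array([window]))
    print("window", w, "->", textLabel(pred.argmax()))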