Everyday sound classification¶
Credit: AITS Cainvas Community
Danger identification¶
Photo by Ekrem EDALI on Dribbble
Identify sounds that indicate possible danger in the surroundings.
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
import os
import IPython.display as ipd
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
The dataset¶
On Kaggle by Chris Gorgolewski
The UrbanSound8K dataset is commonly used for academic research. It contains 8732 labeled sound excerpts of urban sounds from 10 classes:
air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, and street_music.
The classes are drawn from the urban sound taxonomy. Each file is at most 4 s long. The files are pre-sorted into ten folds (folders named fold1-fold10) to make it easy to reproduce and compare against results reported in academic papers. In addition to the sound excerpts, a CSV file containing metadata about each excerpt is also provided.
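Each excerpt lives under fold&lt;N&gt;/ with its file name in the slice_file_name column of the metadata CSV; a minimal sketch (runnable once the archive has been extracted to audio/ as in the cells below) of resolving one file from the metadata:
# Sketch: resolve the on-disk path of the first excerpt from the metadata CSV.
# Assumes the archive has been extracted to "audio/" as done below.
meta = pd.read_csv('audio/UrbanSound8K.csv')
row = meta.iloc[0]
example_path = "audio/fold{}/{}".format(row['fold'], row['slice_file_name'])
print(row['class'], example_path)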
!wget -O archive.zip "https://storage.googleapis.com/kaggle-data-sets/500970/928025/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210307%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210307T062240Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=56582821d50b586320ec4577bddf184342c308bed951e3611c9aa67e83261cbced21ed4c3cfc796682080b9690d9d9031570458a605730cf635b93bde57666da8a42bf48a1607f8c43a437ee8ce9af05e95005ff2659f6c3279f77f079d697fafed30fc6a0f2702550e2fc8cc3305d896f6fc8726003fd376999b10b62752e3e0f30981c814652384a708ae9a03fab3493edc0ba77fdf21b3ee2300d28ff45b676a8d688798f14e903f8beaa55ba7247bfa00cd806441508fc8eac9a011affcb816dc03a963b0d891efa16cfc711e34f341775a67e673ea1fe907fe401d4d496fe794f61509666d8bd5361d6aebf15b464b2db48c696e9f0b872eef4e86620ed"
!unzip -q archive.zip -d audio
!rm archive.zip
df = pd.read_csv('audio/UrbanSound8K.csv')
df
df['class'].value_counts()
max_length = 4 #length (in seconds) of input
desired_sr = 16000 #sampling rate to use
# NOTE: downsampling to 4000 Hz results in a significant accuracy drop (to ~60%)
mic_sr = 16000 # sampling rate supported by the microphone interface (e.g., PDM)
desired_samples = max_length*desired_sr #total number of samples in input
tf.random.set_seed(0)
np.random.seed(0)
os.environ['PYTHONHASHSEED'] = '0'
Utility functions¶
#Processing the data
dataset_min = 0.0
dataset_max = 1.0
#Function to denormalize output values
def denormalize_dataset(input_val):
    global dataset_min, dataset_max
    return input_val * (dataset_max - dataset_min)

#Function to normalize input values
def normalize_dataset(input_val):
    global dataset_min, dataset_max
    dataset_min = np.min(input_val)
    dataset_max = np.max(input_val)
    diff = dataset_max - dataset_min
    if (diff != 0):
        input_val /= diff
    return input_val
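A quick round-trip check of the two helpers (illustrative only); note that normalization just rescales by the range and does not subtract the minimum:
# Toy example: normalize then denormalize a small array.
toy = np.array([0.0, 2.0, 4.0], dtype=np.float32)
scaled = normalize_dataset(toy)        # divides by (max - min) = 4 -> [0., 0.5, 1.]
restored = denormalize_dataset(scaled) # multiplies back by the stored range
print(scaled, restored)
# (this also updates the global dataset_min/dataset_max, which the real normalization below resets)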
def interpolateAudio(audio):
    factor = float(mic_sr)/desired_sr
    x_interp_values = []
    for i in range(len(audio)):
        x_interp_values.append(int(factor*i))
    audio_interpolated = np.interp(range(int(len(audio)*factor)), x_interp_values, audio)
    return mic_sr, audio_interpolated
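With mic_sr equal to desired_sr the interpolation factor is 1.0 and the function is effectively a pass-through; a small illustrative call:
# Illustration only: interpolate a short dummy signal to the microphone rate.
_, dummy_out = interpolateAudio(np.zeros(8, dtype=np.float32))
print(dummy_out.shape)  # same length as the input when mic_sr == desired_sr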
word_dirs = list(set(df['class'].to_list()))
hotwords = ['jackhammer', 'dog_bark', 'siren', 'gun_shot']
print("All words in dataset - \n", ', '.join(word_dirs))
print("\nHotwords - \n", ', '.join(hotwords))
Handling labels and background samples¶
add_noise = False # add different words, null samples and random noise
n_classes = len(hotwords) + int(add_noise)
class_nSamples = 1000
other_nSamples = float(class_nSamples)/(len(word_dirs) - n_classes)
def nLabel(word):
    return n_classes-1 if ( word not in hotwords ) else hotwords.index(word)

def textLabel(index):
    return hotwords[index] if index < len(hotwords) else "background"

def sampleBackGround():
    return add_noise
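A quick check of the label mapping: hotwords map to their index in the list, while every other word (only included when add_noise is True) falls into the last class slot:
# Illustrative label lookups.
print(nLabel('dog_bark'), textLabel(nLabel('dog_bark')))  # 1 dog_bark
print(nLabel('gun_shot'), textLabel(nLabel('gun_shot')))  # 3 gun_shot
print(nLabel('street_music'))  # n_classes-1; a separate "background" class only exists when add_noise is True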
Creating the dataset¶
#Dataset storing audio samples for wake word and background
top_dir = "audio"
input_audio = np.empty((0, desired_samples)).astype(np.float32)
input_labels = np.empty((0)).astype(np.int32) # index of the word in the hotwords list is the label
for word in word_dirs:
    print("\n", word)
    if (word not in hotwords) and (not sampleBackGround()): # background, do not include
        print("-- Background/noise/other words not included")
        continue
    else: # to be included
        dfx = df[df['class'] == word]
        start_time = time.time()
        wav_files = 0
        word_samples = np.empty((0, desired_samples))
        if word in hotwords: # hotwords
            print("-- Category : hotword")
            for i in range(len(dfx)):
                file_path = top_dir + "/fold" + str(dfx.iloc[i]['fold']) + "/" + str(dfx.iloc[i]['slice_file_name'])
                X_sub = np.empty((0, desired_samples))
                X, sr = librosa.core.load(file_path, sr=desired_sr)
                X, interval = librosa.effects.trim(X)
                if X.shape[0] < desired_sr: # skip clips shorter than 1 second
                    continue
                if X.shape[0] % desired_samples != 0: # pad only when needed, otherwise unnecessary silence gets appended
                    X = np.pad(X, (0, desired_samples - (X.shape[0] % desired_samples)))
                X_sub = np.array(np.split(X, int(X.shape[0]*1.0/desired_samples)))
                word_samples = np.append(word_samples, X_sub, axis=0)
                if (word_samples.shape[0] > class_nSamples):
                    break
                wav_files = wav_files + 1
        else:
            print("-- Category : background/noise/other words")
            for i in range(len(dfx)):
                file_path = top_dir + "/fold" + str(dfx.iloc[i]['fold']) + "/" + str(dfx.iloc[i]['slice_file_name'])
                X, sr = librosa.core.load(file_path, sr=desired_sr)
                X, interval = librosa.effects.trim(X)
                X = np.pad(X, (0, desired_samples - (X.shape[0] % desired_samples)))
                X_sub = np.array(np.split(X, int(X.shape[0]*1.0/desired_samples)))
                word_samples = np.append(word_samples, X_sub, axis=0)
                if (word_samples.shape[0] > other_nSamples):
                    break
                wav_files = wav_files + 1
        if (word_samples.size > 0):
            input_audio = np.concatenate((input_audio, word_samples), axis=0)
            labels = np.full((word_samples.shape[0]), nLabel(word))
            input_labels = np.concatenate((input_labels, labels))
            print("added {} audio files with {} samples for word \"{}\" with label {} in {:.1f} sec.".format(
                wav_files, labels.shape[0], word, nLabel(word), (time.time() - start_time)))
# One-hot encoding the integer labels
onehot_labels = np.zeros((input_labels.size, n_classes)).astype(np.int32)
onehot_labels[np.arange(input_labels.size), input_labels] = 1
input_labels = onehot_labels
print("Input dataset size:", input_audio.shape)
print("Input targets size:", input_labels.shape)
Adding noise/silence as background¶
# Add 10% of random noise and 10% of silent samples as background.
if (sampleBackGround()):
    n_bg_samples = int(other_nSamples)
    bg_labels = np.zeros((n_bg_samples, n_classes)).astype(np.int32)
    bg_labels[:, n_classes-1] = 1

    silence = np.zeros((n_bg_samples, desired_samples))
    input_audio = np.append(input_audio, silence, axis=0)
    input_labels = np.append(input_labels, bg_labels, axis=0)

    background = np.random.random((n_bg_samples, desired_samples)) # random noise, as stated in the comment above
    input_audio = np.append(input_audio, background, axis=0)
    input_labels = np.append(input_labels, bg_labels, axis=0)

    # %xdel background
    # %xdel silence
    # %xdel bg_labels
Extracting STFT features¶
# hop_len=int(win_len/4) # default
# fft_len=pow(2, int(np.log2(win_len)+1))
fft_len = 2048
win_len = fft_len
hop_len = int(win_len/4)
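With these settings a 4-second clip at 16 kHz yields int(fft_len/2 + 1) = 1025 frequency bins and int(desired_samples/hop_len + 1) = 126 time frames, which is the shape pre-allocated below; a quick check:
# Expected spectrogram shape for one clip, derived from the constants above.
freq_bins = fft_len // 2 + 1                  # 1025
time_frames = desired_samples // hop_len + 1  # 126
print((freq_bins, time_frames))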
def spectrogramOp(X):
    # STFT returns np.ndarray of shape=(1 + fft_len/2, t)
    spectrogram_out = librosa.core.stft(X, n_fft=fft_len, hop_length=hop_len, win_length=win_len, center=True)
    # spectrogram_out = np.swapaxes(np.abs(spectrogram_out), 0, 1)
    return np.absolute(spectrogram_out)

#inputs = np.array([spectrogramOp(input) for input in input_audio])
input_spectrogram = np.empty((input_audio.shape[0], int(fft_len/2 + 1), int(desired_samples/hop_len + 1))).astype(np.float32)
i = 0
for input in input_audio:
    input_spectrogram[i] = spectrogramOp(input)
    i = i + 1
print("input dataset size:", input_spectrogram.shape)
Normalize¶
# Normalize
input_spectrogram = normalize_dataset(input_spectrogram)
Train-test split¶
total_len = input_labels.shape[0]
#Shuffling inputs and labels
shuffle_permutation = np.arange(total_len)
np.random.shuffle(shuffle_permutation)
input_spectrogram = input_spectrogram[shuffle_permutation]
input_labels = input_labels[shuffle_permutation]
#Splitting into train and test dataset - 90-10 ratio
train_split = 0.9
cutoff = int(train_split*total_len)
inputs_train = input_spectrogram[:cutoff]
inputs_test = input_spectrogram[cutoff:]
labels_train = input_labels[:cutoff]
labels_test = input_labels[cutoff:]
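train_test_split is imported above but never used; the same 90/10 split could also be obtained with it, stratified on the class index so both sets keep a similar label distribution. A sketch (reusing the same variable names so the rest of the notebook still works):
# Optional alternative to the manual shuffle-and-slice split above.
inputs_train, inputs_test, labels_train, labels_test = train_test_split(
    input_spectrogram, input_labels, test_size=0.1,
    stratify=input_labels.argmax(axis=1), random_state=0)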
Play using inverse-STFT¶
#Selecting random index from train dataset
ind = int(np.random.uniform()*len(inputs_train))
#Displaying sample spectrogram and audio from train dataset
X = inputs_train[ind]
y = labels_train[ind].argmax()
print("Label :", textLabel(y) )
plt.imshow(X, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()
audio = librosa.core.istft(X, hop_length=hop_len, win_length=win_len)
ipd.Audio(audio, rate=desired_sr, autoplay=True)
Model¶
model = tf.keras.Sequential()
# inputs_train.shape[1:] = (int(fft_len/2 + 1), int(desired_samples/hop_len + 1)) = (1025, 126) with the settings above.
# int(fft_len/2 + 1) = 1025 = frequency bins (rows), int(desired_samples/hop_len + 1) = 126 = time frames (columns).
lambda1 = tf.keras.layers.Lambda(lambda x: tf.reshape(x, (-1, int(fft_len/2 + 1), int(desired_samples/hop_len + 1), 1)),
                                 name="add_channels", input_shape=(None, int(fft_len/2 + 1), int(desired_samples/hop_len + 1)))
conv2d1 = tf.keras.layers.Conv2D(16, (int(fft_len/2 + 1), 4), strides=1, activation='relu', name="conv1",
                                 input_shape=(int(fft_len/2 + 1), int(desired_samples/hop_len + 1), 1))
conv2d2 = tf.keras.layers.Conv2D(32, (1, 4), strides=4, activation='relu', name="conv2")
conv2d3 = tf.keras.layers.Conv2D(64, (1, 4), strides=3, activation='relu', name="conv3")
flatten1 = tf.keras.layers.Flatten()
dense1 = tf.keras.layers.Dense(6*n_classes)
dense2 = tf.keras.layers.Dense(n_classes)
dropout = tf.keras.layers.Dropout(0.2)
activation1 = tf.keras.layers.Activation('softmax')
model.add(lambda1)
model.add(conv2d1)
model.add(conv2d2)
model.add(conv2d3)
model.add(flatten1)
model.add(dense1)
model.add(dense2)
model.add(dropout)
model.add(activation1)
model.compile(optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'],
loss=(tf.keras.losses.binary_crossentropy if (n_classes==2) else tf.keras.losses.categorical_crossentropy))
model.summary()
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights = True)]
model.fit(inputs_train, labels_train, batch_size=64, epochs=1024, callbacks=callbacks, validation_data=(inputs_test, labels_test))
model.evaluate(inputs_test, labels_test)
ind = int(np.random.uniform()*len(inputs_test))
spectrogram_out = inputs_test[ind]
# ipd.Audio(spectrogram_out, rate=desired_sr)  # disabled: this is a magnitude spectrogram, not audio; playback via the inverse STFT is below
y = labels_test[ind]
output = model.predict(np.expand_dims(np.array([inputs_test[ind]]), 0))
print("True label:", textLabel(np.argmax(y)))
print("Prediction:", textLabel(np.argmax(output)))
plt.imshow(spectrogram_out, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()
audio = librosa.core.istft(spectrogram_out, hop_length=hop_len, win_length=win_len)
playback_sr, audio_interpolated = interpolateAudio(audio)
ipd.Audio(audio_interpolated, rate=playback_sr, autoplay=True)
Confusion matrix¶
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
predictions_test = model.predict(inputs_test)
cm = confusion_matrix(labels_test.argmax(axis=1), predictions_test.argmax(axis=1))
tck_labels = hotwords+["noise"] if ( sampleBackGround() ) else hotwords
sns.heatmap(cm, xticklabels=tck_labels, yticklabels=tck_labels, annot=True,fmt="d",)
plt.title("Validation Confusion Matrix")
plt.ylabel("A C T U A L")
plt.xlabel("P R E D I C T E D")
plt.show()
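For per-class precision and recall, scikit-learn's classification_report can complement the confusion matrix (an optional addition, reusing the predictions computed above):
# Optional: per-class precision/recall/F1 on the test split.
from sklearn.metrics import classification_report
print(classification_report(labels_test.argmax(axis=1),
                            predictions_test.argmax(axis=1),
                            target_names=tck_labels))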
On background and silence¶
background = spectrogramOp(np.random.random((desired_samples)))
silence = spectrogramOp(np.zeros((desired_samples)))
background_out, silence_out = model.predict(np.array([background, silence]))
print("Predicted ", textLabel(background_out.argmax()), "on random audio with vector", background_out)
print("Predicted ", textLabel(silence_out.argmax()), "on null audio with vector", silence_out)
On longer clips¶
n = 4 # number of samples
X, y = [], []
top_dir = "audio"
for i in range(n):
    i = int(np.random.uniform()*len(df))
    file_path = top_dir + "/fold" + str(df.iloc[i]['fold']) + "/" + str(df.iloc[i]['slice_file_name'])
    audio, sr = librosa.core.load(file_path, sr=desired_sr, mono=True)
    audio, interval = librosa.effects.trim(audio, top_db=30)
    audio = np.pad(audio, int((desired_samples-audio.shape[0])/2))
    X = np.append(X, audio)
    y.append(df.iloc[i]['class'])
X = np.pad(X, (0, desired_samples - (X.shape[0]%desired_samples)))
assert sr == desired_sr #ensure sample rate is same as desired
assert len(X.shape) == 1 #ensure X is a mono signal
spectrogram_out = spectrogramOp(X)
plt.imshow(spectrogram_out, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()
print("The true labels are:", ', '.join(y))
ipd.Audio(X, rate=sr, autoplay=True)
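The concatenated clip is several times longer than one model input; one way (a sketch, not part of the original notebook) to run the classifier over it is to slice the long spectrogram into model-sized windows along the time axis and predict each window separately:
# Sketch: classify the long clip window by window.
# Each window spans roughly one desired_samples-long chunk of audio, i.e. time_frames spectrogram columns.
time_frames = desired_samples // hop_len + 1
for w in range(spectrogram_out.shape[1] // time_frames):
    window = spectrogram_out[:, w*time_frames:(w+1)*time_frames]
    pred = model.predict(np.array([window]))
    print("window", w, "->", textLabel(pred.argmax()))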