Speech Recognizer

Made by SeanvonB | Source

This was my final project for Udacity's Natural Language Processing Nanodegree, which I completed in late 2020. It's really a summative project for all of my School of AI Nanodegrees – AI Programming and Computer Vision included – because it relies heavily on knowledge from all three courses.

In data science terms, I should actually call this an Automatic Speech Recognition (ASR) pipeline. An ASR pipeline receives spoken audio as input and returns a text transcript of the speech as output, so you'll often find this at the heart of speech recognition or dictation software. The end result should look something like this:

I'll explain the process across the following three sections:

  1. Preprocessing
  2. Models
  3. Prediction

Let's get started!

1.0 Preprocessing

As always, I begin by examining the dataset. For this project, Udacity provided us with the LibriSpeech Corpus, which is about 1,000 hours of English speech samples compiled from public domain audio books. The full corpus can be found here; however, for this project, Udacity selected a small subset to help reduce the training burden.

The following loads the dataset, which returns these variables:

  • vis_text - transcribed text (label) for the training example.
  • vis_raw_audio - raw audio waveform for the training example.
  • vis_mfcc_feature - mel-frequency cepstral coefficients (MFCCs) for the training example.
  • vis_spectrogram_feature - spectrogram for the training example.
  • vis_audio_path - the file path to the training example.
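In sketch form, that loading step boils down to a single call. The helper name and its home in utils.py are assumptions on my part here, but the unpacked variables are the ones listed above:

```python
# Sketch of the loading cell; vis_train_features is an assumed helper name.
from utils import vis_train_features

vis_text, vis_raw_audio, vis_mfcc_feature, vis_spectrogram_feature, vis_audio_path = (
    vis_train_features()
)

print(vis_text)        # ground-truth transcript for the first training example
print(vis_audio_path)  # path to the underlying audio file
```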

1.1 Sample the Data

Next, the following will import the tools necessary for visualizing and playing audio samples. The embedded IPython audio player should allow you to listen to the first sample from the dataset:
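In sketch form, playback is just IPython's Audio widget wrapped around the raw waveform loaded above (LibriSpeech audio is sampled at 16 kHz):

```python
# Minimal sketch: inline playback of the first raw waveform from the dataset.
from IPython.display import Audio

Audio(data=vis_raw_audio, rate=16000)  # embedded player for the first sample
```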

1.2 Choosing Feature Representation

The above sample isn't yet useful to me, because computers can't hear – at least, not readily. I've read about some deep learning architectures that can read raw audio data, but they're a different beast. However, from projects like my Facial Keypoint Detector and Image Captioner, I've learned that computers can see pretty well. Images are just matrices, so I know the first step to preprocessing this data could simply be to make the audio samples just as matrix-y as possible.

Udacity provided two suggestions: spectrograms and mel-frequency cepstral coefficients (MFCCs).

So, let's look at both...

1.3 Spectrograms

You know 'em, you love 'em, and you didn't know how to calculate them until you borrowed the calculation from this repository for utils.py.

These bad boys are 3D representations of an audio signal over time. On the x-axis, you have time; on the y-axis, frequency; and, represented in the third dimension by color, amplitude. Those are all definitely physics words that I learned at one point and briefly re-learned for this project.

To speed up calculations without impacting performance, spectrograms can also be normalized to fall within the range of -1 to 1.
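Here's a minimal sketch of that kind of scaling – illustrative only, since the actual normalization in utils.py may differ in detail:

```python
import numpy as np

def normalize_spectrogram(spec, eps=1e-14):
    """Min-max scale a spectrogram into the range [-1, 1]."""
    spec = np.asarray(spec, dtype=np.float64)
    spec_min, spec_max = spec.min(), spec.max()
    return 2.0 * (spec - spec_min) / (spec_max - spec_min + eps) - 1.0
```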

Here's an example of one:

1.4 Mel-Frequency Cepstral Coefficients (MFCCs)

This time, the calculation was boosted from this repository for utils.py.

As I'll show in the example below, MFCCs look like simplified versions of spectrograms. Their calculation involves some "linear cosine transform of a log power spectrum" stuff, but the Mel part caught my attention. The Mel scale is a scale of pitches that human listeners hear as being equidistant from each other, sort of like a scientific solfège. So, while they are simplified spectrograms in some respect, they've been simplified to favor human experience. That sounds like a way to help a model hear as we do. An MFCC feature is also much lower-dimensional than a spectrogram feature, which could help a model generalize – interesting!
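For reference, here's a sketch of how MFCCs can be computed with the python_speech_features package, which is the kind of calculation that ended up in utils.py (13 coefficients per frame, matching the mfcc_dim used later):

```python
# Sketch: compute 13 MFCCs per frame for one training example.
import soundfile as sf
from python_speech_features import mfcc

samples, sample_rate = sf.read(vis_audio_path)                   # waveform + 16 kHz rate
mfcc_feature = mfcc(samples, samplerate=sample_rate, numcep=13)  # shape: (frames, 13)
```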

Here's an example of an MFCC; again, normalized:

2.0 Models

Now that the audio data is preprocessed into a format that a model can receive, I'll begin playing with some neural network architectures for acoustic modeling. Just like I did with the Language Translator, I'll begin simple and add new features after each successful training session.

Here are the steps this stage will take:

  1. Simple RNN
  2. RNN + TimeDistributed Dense
  3. CNN + RNN + TimeDistributed Dense
  4. Deeper RNN + TimeDistributed Dense
  5. Bidirectional RNN + TimeDistributed Dense
  6. Final Model

And, of course, you know the final_model will just be a mash of whatever worked well together.

Let's begin with some workspace utility, provided by Udacity:

2.1 Simple RNN

Again, there isn't anything simple about a Recurrent Neural Network, but this one will be the vanilla flavor of the day. Because I'm working with sequential data, all of the models in this notebook will prominently feature RNNs, and this model will serve as the baseline.

As you can see in this example, this model (and all of the others) will take the acoustic features of the audio sequentially, one time step at a time:

Then, for each time step, the model will choose from 28 possible outputs: the 26 letters of the English alphabet, the space character, or an apostrophe. Well, technically the model will produce a vector of probabilities over all of the above, but I'll just use that vector to select the highest probability for now.

The following example is the same model in what's called the unrolled format:

The simple RNN is specified in Keras as follows:
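Roughly like this, that is – the real version lives in models.py, so the names and arguments below are illustrative. Note that the output layer is 29 wide: the 28 characters above plus the blank token that CTC needs.

```python
from keras.models import Model
from keras.layers import Input, GRU, Activation

def simple_rnn_model(input_dim, output_dim=29):
    """Baseline acoustic model: one recurrent layer, softmax per time step."""
    input_data = Input(name='the_input', shape=(None, input_dim))
    # A single GRU maps each time step directly to character scores
    rnn = GRU(output_dim, return_sequences=True, name='rnn')(input_data)
    # Softmax turns those scores into a probability distribution per time step
    y_pred = Activation('softmax', name='softmax')(rnn)
    return Model(inputs=input_data, outputs=y_pred)
```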

My acoustic models will train with Connectionist Temporal Classification (CTC) as their loss function. Note: using custom loss functions like CTC with Keras required some tinkering in 2020, but I don't know whether this is still true. Udacity helped me implement this criterion as add_ctc_loss in utils.py.
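The tinkering is the usual lambda-layer pattern: because a Keras loss function only sees (y_true, y_pred), the CTC loss gets computed inside the graph instead. Whether add_ctc_loss is implemented exactly this way is an assumption, but the sketch below captures the idea:

```python
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Lambda

def add_ctc_loss(acoustic_model):
    """Wrap an acoustic model so its 'output' is the CTC loss itself."""
    labels = Input(name='the_labels', shape=(None,), dtype='float32')
    input_length = Input(name='input_length', shape=(1,), dtype='int64')
    label_length = Input(name='label_length', shape=(1,), dtype='int64')
    # K.ctc_batch_cost runs inside a Lambda layer; if a conv front end
    # downsamples the sequence, input_length must be adjusted to match.
    loss_out = Lambda(lambda args: K.ctc_batch_cost(*args), name='ctc')(
        [labels, acoustic_model.output, input_length, label_length])
    return Model(inputs=[acoustic_model.input, labels, input_length, label_length],
                 outputs=loss_out)
```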

Finally, training involves a number of optional arguments that can fine-tune the process:

  • minibatch_size - the size of the minibatches that are generated while training the model (default: 20).
  • spectrogram - Boolean value for whether spectrograms (True) or MFCCs (False) are used for training (default: True).
  • mfcc_dim - the size of the feature dimension to use when generating MFCC features (default: 13).
  • optimizer - the Keras optimizer used to train the model (default: SGD).
  • epochs - the number of epochs to use to train the model (default: 20).
  • verbose - controls the verbosity of the training output in the model.fit_generator method (default: 1).
  • sort_by_duration - Boolean value dictating whether the training and validation sets are sorted by (increasing) duration before the start of the first epoch (default: False).

For all hyperparameters, I chose what I determined to be commonly accepted "best practice" or "proof of concept" defaults after some searching on Stack Overflow and Reddit.

I mentioned it before, but you might also notice input_dim=13 appearing frequently. This indicates that I chose to use MFCCs over spectrograms, which would instead require input_dim=161.

The following cell will train model_0:
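In sketch form, that call looks something like this – the helper name train_model and the path arguments follow the Udacity workspace utilities, so treat them as assumptions:

```python
from utils import train_model            # assumed location of the training helper
from models import simple_rnn_model      # assumed name of the baseline model

model_0 = simple_rnn_model(input_dim=13)     # 13 = MFCC features per time step

train_model(input_to_softmax=model_0,
            pickle_path='model_0.pickle',    # where the loss history gets saved
            save_model_path='model_0.h5',    # where the best weights get saved
            spectrogram=False)               # False -> train on MFCCs, not spectrograms
```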

2.2 RNN + TimeDistributed Dense

The primary change for this model will be the addition of a TimeDistributed Dense layer along with BatchNormalization. Generally, batch normalization refers to a collection of strategies that allow the network to train faster by safely using a higher learning rate; paired with the TimeDistributed layer, which applies the dense output layer to every time step, the network can use that advantage to find more complex relationships in the dataset.
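A sketch of the change (unit counts and layer names are illustrative): batch normalization after the recurrent layer, then a Dense layer applied to every time step via the TimeDistributed wrapper:

```python
from keras.models import Model
from keras.layers import (Input, GRU, BatchNormalization,
                          TimeDistributed, Dense, Activation)

def rnn_model(input_dim, units=200, output_dim=29):
    input_data = Input(name='the_input', shape=(None, input_dim))
    rnn = GRU(units, return_sequences=True, name='rnn')(input_data)
    # Normalize recurrent activations so training tolerates a higher learning rate
    bn_rnn = BatchNormalization(name='bn_rnn')(rnn)
    # Apply the same Dense classifier independently at every time step
    time_dense = TimeDistributed(Dense(output_dim))(bn_rnn)
    y_pred = Activation('softmax', name='softmax')(time_dense)
    return Model(inputs=input_data, outputs=y_pred)
```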

Here is the new model, rolled:

But I think the unrolled model illustrates this one much better:

The following cells will train model_1:

2.3 CNN + RNN + TimeDistributed Dense

Expanding on the previous model, this one includes a 1D Convolutional layer, which borrows a technique from computer vision to hopefully extract more useful feature maps: sliding a filter over the initial feature representation to enhance important features while muting others.
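Here's a sketch of that addition – the filter count, kernel size, and stride are illustrative choices, not the tuned values:

```python
from keras.models import Model
from keras.layers import (Input, Conv1D, BatchNormalization, GRU,
                          TimeDistributed, Dense, Activation)

def cnn_rnn_model(input_dim, filters=200, kernel_size=11, conv_stride=2,
                  units=200, output_dim=29):
    input_data = Input(name='the_input', shape=(None, input_dim))
    # Slide a bank of 1D filters over the time axis of the features
    conv_1d = Conv1D(filters, kernel_size, strides=conv_stride, padding='valid',
                     activation='relu', name='conv1d')(input_data)
    bn_cnn = BatchNormalization(name='bn_conv1d')(conv_1d)
    # The recurrent + TimeDistributed stack from the previous model follows
    rnn = GRU(units, return_sequences=True, name='rnn')(bn_cnn)
    bn_rnn = BatchNormalization(name='bn_rnn')(rnn)
    time_dense = TimeDistributed(Dense(output_dim))(bn_rnn)
    y_pred = Activation('softmax', name='softmax')(time_dense)
    return Model(inputs=input_data, outputs=y_pred)
```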

The resulting model will look like this:

The following cells will train model_2:

2.4 Deeper RNN + TimeDistributed Dense

Until now, the models used a single recurrent layer, but I want to see what happens if I adjust the model to accept a variable number of RNN layers. On one hand, this could simply be overkill that only serves to extend training time; on the other, maybe the depth provided by multiple RNNs will be what cracks this problem – I'll be looking for a strong performance uplift to justify the added layers.
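The idea in sketch form is just a loop over recurrent layers, where recur_layers is the knob being tested (names and unit counts illustrative):

```python
from keras.models import Model
from keras.layers import (Input, GRU, BatchNormalization,
                          TimeDistributed, Dense, Activation)

def deep_rnn_model(input_dim, units=200, recur_layers=2, output_dim=29):
    input_data = Input(name='the_input', shape=(None, input_dim))
    layer = input_data
    # Stack as many GRU + BatchNormalization blocks as requested
    for i in range(recur_layers):
        layer = GRU(units, return_sequences=True, name='rnn_{}'.format(i))(layer)
        layer = BatchNormalization(name='bn_rnn_{}'.format(i))(layer)
    time_dense = TimeDistributed(Dense(output_dim))(layer)
    y_pred = Activation('softmax', name='softmax')(time_dense)
    return Model(inputs=input_data, outputs=y_pred)
```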

Since I'm worried about training time, I'll temporarily remove the CNN, and the model will look like this:

The following cells will train model_3:

2.5 Bidirectional RNN + TimeDistributed Dense

Deeper RNNs provide more of the same benefit, but a Bidirectional layer provides something new: the ability to see future contexts, which can make a difference when working with language, where quirks like phrasal verbs or split clauses might otherwise confuse the model.
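In sketch form, the only new piece is the Bidirectional wrapper, which runs one GRU forward and one backward over the sequence and concatenates their outputs:

```python
from keras.models import Model
from keras.layers import (Input, GRU, Bidirectional,
                          TimeDistributed, Dense, Activation)

def bidirectional_rnn_model(input_dim, units=200, output_dim=29):
    input_data = Input(name='the_input', shape=(None, input_dim))
    # Forward and backward GRUs give each time step both past and future context
    bidir_rnn = Bidirectional(GRU(units, return_sequences=True),
                              name='bidir_rnn')(input_data)
    time_dense = TimeDistributed(Dense(output_dim))(bidir_rnn)
    y_pred = Activation('softmax', name='softmax')(time_dense)
    return Model(inputs=input_data, outputs=y_pred)
```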

As you can see, this model looks a lot like the last one, but the RNN layers now feed in both directions:

The following cells will train model_4:

2.6 Compare the Models

I didn't talk about the performance of each model, because I was waiting for this step. The following cell will plot the change in training and validation loss per epoch for each model, so we can see how each one performed. It may also be possible to see when models begin to overfit or suffer from exploding/vanishing gradients. Everything is better with graphs!
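Assuming each training run pickled its Keras history the way the earlier train_model sketch did, the plot is straightforward:

```python
import pickle
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
for name in ['model_0', 'model_1', 'model_2', 'model_3', 'model_4']:
    with open('{}.pickle'.format(name), 'rb') as f:
        history = pickle.load(f)               # dict with 'loss' and 'val_loss'
    ax1.plot(history['loss'], label=name)
    ax2.plot(history['val_loss'], label=name)
ax1.set(title='Training loss', xlabel='Epoch', ylabel='CTC loss')
ax2.set(title='Validation loss', xlabel='Epoch', ylabel='CTC loss')
ax1.legend()
ax2.legend()
plt.show()
```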

Obviously, TimeDistributed had a pretty substantial impact; otherwise, the models performed more or less the same. There does, however, appear to be a clear benefit to including the CNN for feature extraction, so my final_model will certainly make use of this architecture.

2.7 Final Model

My final model for this project can be found with the others in models.py and includes the Convolutional, BatchNormalization, Bidirectional, and TimeDistributed layers as they were shown in the steps above.

It also includes Dropout layers per this research paper, which required some help from this repository to implement for recurrent layers. I did not include Dropout thus far, because the training sessions were relatively short and overfitting wasn't as likely. I also experimented with adding MaxPool layers to the CNN, but my implementation actually performed substantially worse with Max Pooling for some reason.
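Here's a sketch of how those pieces fit together – unit counts, dropout rates, and layer names are illustrative; the real final_model is in models.py:

```python
from keras.models import Model
from keras.layers import (Input, Conv1D, BatchNormalization, Bidirectional,
                          GRU, TimeDistributed, Dense, Activation, Dropout)

def final_model(input_dim=13, filters=200, kernel_size=11, conv_stride=2,
                units=200, output_dim=29):
    input_data = Input(name='the_input', shape=(None, input_dim))
    # Convolutional front end for local feature extraction
    conv_1d = Conv1D(filters, kernel_size, strides=conv_stride, padding='valid',
                     activation='relu', name='conv1d')(input_data)
    bn_cnn = BatchNormalization(name='bn_conv1d')(conv_1d)
    # Bidirectional recurrent layer with dropout on inputs and recurrent state
    bidir_rnn = Bidirectional(GRU(units, return_sequences=True,
                                  dropout=0.3, recurrent_dropout=0.3),
                              name='bidir_rnn')(bn_cnn)
    bn_rnn = BatchNormalization(name='bn_rnn')(bidir_rnn)
    # Per-time-step classifier, with extra dropout before the softmax
    time_dense = TimeDistributed(Dense(output_dim))(Dropout(0.3)(bn_rnn))
    y_pred = Activation('softmax', name='softmax')(time_dense)
    return Model(inputs=input_data, outputs=y_pred)
```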

Whew, the following cells will train the final_model:

3.0 Prediction

We've reached the best part: watching a computer almost transcribe human speech. Yeah, unfortunately, this project was never destined to be a perfect speech-to-text device. That would require substantial training time on expensive cloud computing hardware, and Udacity provided only so much time and so much GPU power.

But let's see how close this model comes! The following will retrieve an audio sample, run it through the network, and print the outcome:
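The notebook uses the project's prediction utilities for this; the sketch below swaps in a simple greedy decode instead (index_map, the character lookup, and the blank index are assumptions), just to show the shape of the process:

```python
import numpy as np

def greedy_decode(model, features, index_map):
    """Run one example through the acoustic model and collapse the per-step
    argmax predictions into text by dropping repeats and CTC blanks."""
    probs = model.predict(np.expand_dims(features, axis=0))[0]  # (time, 29)
    best_path = np.argmax(probs, axis=1)
    blank = probs.shape[1] - 1          # assumes the blank is the last index
    chars, prev = [], None
    for idx in best_path:
        if idx != prev and idx != blank:
            chars.append(index_map[idx])
        prev = idx
    return ''.join(chars)
```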

That's definitely not perfect. But I do understand exactly what was said. What do you think?

This one might be slightly worse. But I find it interesting that the model struggles most with linking (in speech terms) or spacing (in text terms), because that's also the most difficult part of language acquisition and comprehension for non-native English speakers.

Clearly, there remains work to be done. But I'm honestly pretty proud of what was accomplished with relatively few resources, which nicely highlights the power of machine learning. Given a dedicated, pre-trained language model and/or 4-6 weeks of low-learning-rate training on Amazon or Google Cloud servers, I think my architecture might just hold up!

Thanks for reading!

Made by SeanvonB | Source