Image Captioner

Made by SeanvonB | Source

This is another project that was part of my Computer Vision Nanodegree from 2020. In this notebook, I cover the process of developing an image captioner: a network that receives an image and returns a written description of that image. It combines Convolutional Neural Networks (CNNs) and Recurrent Neural Networks (RNNs) with Long Short-Term Memory (LSTM) cells to create a network with both feedforward and feedback connections. Beyond the obvious but incredible difference an image captioner can make for accessibility, this project demonstrates a network that's capable of inferring contextual nuance, which has countless other applications and is simply fascinating. On the other hand, this project sometimes demonstrates a network that's incapable of inferring contextual nuance, which, while disappointing, can also be pretty funny.

Per Udacity's instruction, the project is broken down into four steps:

  1. Understand the data
  2. Preprocess the data
  3. Train the model
  4. Test the model

Let's get started!

1.0 Understand the Data

Image and text data for this project is provided by the Microsoft Common Objects in Context dataset, or COCO. In addition to captioning algorithms, this dataset would be ideal for any project that relies on contextual recognition, object detection, object segmentation, and pattern recognition. You can read more about COCO here.

Here's an example of the dataset that's provided by COCO itself:

As you can see, there's a pretty wide variety of contexts, as well as some objects that seemingly lack any context. Further, in addition to specialized data, like the color-segmented examples shown, COCO provides 5 captions per image, which will be the primary association that I expect this network to infer. Plus, the COCO dataset can be accessed through the COCO API, which significantly reduces local file space and just feels like good data science – hopefully, that will reduce the likelihood that I get an email from GitHub about their recommended repo size limit.

1.1 Initialize the COCO API

The following code will establish the connection to COCO and retrieve the dataset for storage in memory. Note that two different types of files are being imported: instance annotations and caption annotations; there will be more on this later. Finally, a list of IDs is created, so that dataset samples can be individually accessed if needed.
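In outline, that setup amounts to something like this (a sketch with assumed file paths; the COCO class comes from pycocotools):

```python
from pycocotools.coco import COCO

# Assumed local paths to the COCO 2014 annotation files -- adjust as needed.
instances_file = "annotations/instances_train2014.json"
captions_file = "annotations/captions_train2014.json"

coco = COCO(instances_file)        # instance annotations (objects, segmentation)
coco_caps = COCO(captions_file)    # caption annotations (5 captions per image)

ids = list(coco.anns.keys())       # annotation IDs, for accessing samples individually
print(f"{len(ids)} annotations loaded")
```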

1.2 Plot a Sample

Next, it's wise to check a few samples before moving on to preprocessing, which I do with two objectives in mind: (1) learn what format/shape a sample has, and (2) learn how to access each component of a sample as needed. So, I'll plot a sample image and print the corresponding captions.

If you rerun this cell, a new image and its captions will be chosen randomly each time.

I think there may already be some interesting details to note: the captions apparently don't follow a rigid pattern – verb tense isn't consistent, and neither are punctuation, capitalization, or article usage. I'm curious how this will affect the outcome and whether it may actually improve generalization.

2.0 Preprocess the Data

This section will cover all of the steps between acquiring the data and actually training the network, which will include transforming the data, preparing the data loader, and determining settings like batch size and how many times words must be seen by the network before they're added to the vocabulary. Lastly, this section will include importing the network.

Rather than use PyTorch's DataLoader as before, Udacity provided their own data loader in data_loader.py (which can be initialized with get_loader) and stated we were not permitted to change this file or use an alternative.

As mentioned in previous projects, data transforms solve two problems: first, they conform inputs to match what pre-trained models expect, which is typically a 224x224 image Tensor with color channels normalized according to ImageNet standards; second, they improve generalization by allowing us to subtly mess with the images, thereby functionally expanding the dataset. For this project, I simply resize the image to a minimum of 256 pixels, randomly extract a 224x224 segment, then horizontally mirror the image 50% of the time.
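For reference, a transform along those lines might look like this (a sketch; the exact transform_train lives in the notebook, but it follows the same pattern):

```python
import torchvision.transforms as transforms

transform_train = transforms.Compose([
    transforms.Resize(256),                  # shorter side resized to 256 pixels
    transforms.RandomCrop(224),              # randomly extract a 224x224 segment
    transforms.RandomHorizontalFlip(),       # mirror the image 50% of the time
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),    # ImageNet channel means
                         (0.229, 0.224, 0.225)),   # ImageNet channel stds
])
```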

Next, for batch size, I typically start with 32; but, after the first run, I will drop it to 16 – or, in this case, 10 – as smaller batch sizes have been shown to help the network generalize.

Then there's the vocabulary threshold, which is a new variable for this kind of project. Setting a minimum number of times that words must be seen before being added to the vocabulary helps ensure that the network will take fewer gambles on words that it doesn't particularly understand. Raising this threshold will produce a network that's more correct but less descriptive, while lowering it will produce a network that's more fun.
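As a rough illustration of what the threshold does (a sketch only; the real vocabulary is built inside Udacity's data loader code, and the special token names here are assumptions):

```python
from collections import Counter

def build_vocab(tokenized_captions, vocab_threshold):
    """Keep only words that appear at least vocab_threshold times."""
    counts = Counter(word for caption in tokenized_captions for word in caption)
    kept = [word for word, count in counts.items() if count >= vocab_threshold]
    # Special tokens are always included, regardless of frequency.
    return {token: idx for idx, token in enumerate(["<start>", "<end>", "<unk>"] + kept)}

captions = [["a", "dog", "runs"], ["a", "cat", "sits"], ["a", "dog", "sleeps"]]
print(build_vocab(captions, vocab_threshold=2))  # only "a" and "dog" make the cut
```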

Finally, the transformed data must be loaded into the data loader from data_loader.py.

2.1 Udacity's Data Loader

. . .

2.2 Load the Data

Another important observation: the captions vary pretty wildly in length. The following output will show that the overwhelming majority of captions are about 10 words long, but there are captions with as few as 6 words or as many as 57. I suspect this is due to crowdsourcing the captions and plain human variance, which makes keeping these rule-breaking captions in the mix all the more important: neither humans nor realistic AI obey the rules! Yikes.

Here's the breakdown:

Rather than restrict the network's learning to the shortest captions or slow down training with a static approach that fits the longest, this research paper suggests an interesting solution: draw batches of image-caption pairs that all feature captions of the same length, and choose that length randomly but proportionately to the number of samples with that length. According to the paper, this approach is computationally optimal without impacting the network's ability to generalize.
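Here's a sketch of that sampling idea, assuming a list of caption lengths for the whole training set (the real version is wired into Udacity's data loader):

```python
import numpy as np

def sample_batch_indices(caption_lengths, batch_size):
    """Pick a caption length at random (weighted by how common it is),
    then draw a batch of samples that all share that length."""
    lengths = np.array(caption_lengths)
    chosen_length = np.random.choice(lengths)            # frequent lengths are chosen more often
    candidates = np.where(lengths == chosen_length)[0]   # every caption with that length
    return list(np.random.choice(candidates, size=batch_size))

caption_lengths = [10, 10, 9, 11, 10, 12, 10, 11, 10, 10]
print(sample_batch_indices(caption_lengths, batch_size=4))
```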

The following creates and fills the data loader, then produces a batch, which will be printed below to confirm format/shape:

2.3 Assemble the Network

For this project, the network architecture consists of two main components: a CNN encoder, and an RNN decoder.

The exact architecture for mine can be found in model.py, but here's an example:

First, I'll import EncoderCNN and DecoderRNN from model.py, then I'll explain what they each do.

2.4 Implement the CNN Encoder

First, let's talk encoder – in the above example, it's the blue part.

I used CNNs in previous projects, and those CNNs extracted features and created feature maps, which were then passed to fully-connected layers for classification or regression. This time, however, the fully-connected layer has been removed; instead, the feature maps are flattened into a vector, run through a Linear layer that resizes the vector to embed_size dimensions, then passed to a whole second network: the decoder. Think of the encoder as a machine that reduces images to their "informational essence" and the decoder as a separate machine that turns "informational essence" into English text. Does this mean that you could swap in a different decoder that was trained on French? Yeah, I think you could!

It wasn't a required part of the assignment, but I also included batch normalization as described in this research paper, which makes the following claim: "Batch Normalization allows us to use much higher learning rates and be less careful about initialization" – you had me at "less careful"!
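The real EncoderCNN lives in model.py; the sketch below just shows the general shape described above, assuming a frozen, pre-trained ResNet-50 backbone (the exact backbone in model.py may differ):

```python
import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    """Pre-trained CNN with its classifier removed, plus a trainable embedding layer."""

    def __init__(self, embed_size):
        super().__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad_(False)               # freeze the pre-trained backbone
        self.resnet = nn.Sequential(*list(resnet.children())[:-1])  # drop the final FC layer
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)   # trainable resize to embed_size
        self.bn = nn.BatchNorm1d(embed_size)          # batch norm, per Ioffe & Szegedy

    def forward(self, images):
        features = self.resnet(images)                    # (batch, 2048, 1, 1)
        features = features.view(features.size(0), -1)    # flatten the feature maps
        return self.bn(self.embed(features))              # (batch, embed_size)
```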

The following assembles the encoder and confirms that my sizes are still correct:

2.5 Implement the RNN Decoder

Now, we can talk decoder – in the above example, it's the teal part.

The main difference between a CNN and an RNN is that the RNN doesn't just feed information forward; it also feeds information back into itself for use on the next pass. In other words, the RNN has memory and will remember the last few things it saw. This allows the network to handle sequential data, like language, where subsequent outputs are determined as much by previous outputs as they are by the next input.

For my decoder, I implemented the one described in this research paper. They seem like they know what they're doing. Additionally, the outputs must be a Tensor with the following shape: [batch_size, captions.shape[1], vocab_size], where outputs[i,j,k] can be interpreted as the likelihood that the i-th caption's j-th token is the k-th token in the vocabulary.
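Again, the real DecoderRNN is in model.py; here's a sketch of the embed → LSTM → Linear layout, using teacher forcing during training (the ground-truth caption is fed in as input, shifted by one step):

```python
import torch
import torch.nn as nn

class DecoderRNN(nn.Module):
    """Word embedding -> LSTM -> Linear, producing a score for every word in the vocabulary."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        # Drop the final token; the image features stand in as the first "word".
        embeddings = self.word_embeddings(captions[:, :-1])
        inputs = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        hiddens, _ = self.lstm(inputs)
        return self.fc(hiddens)   # (batch_size, captions.shape[1], vocab_size)
```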

So, the following assembles the decoder and again confirms that my sizes are correct:

3.0 Train the Model

With the data preprocessed and the network assembled, training can just about begin. All that's left to do is tweak the hyperparameters!

Here's a brief summary of hyperparameters provided by Udacity:

Begin by setting the following variables:

  • batch_size - the batch size of each training batch. It is the number of image-caption pairs used to amend the model weights in each training step.
  • vocab_threshold - the minimum word count threshold. Note that a larger threshold will result in a smaller vocabulary, whereas a smaller threshold will include rarer words and result in a larger vocabulary.
  • vocab_from_file - a Boolean that decides whether to load the vocabulary from file.
  • embed_size - the dimensionality of the image and word embeddings.
  • hidden_size - the number of features in the hidden state of the RNN decoder.
  • num_epochs - the number of epochs to train the model. We recommend that you set num_epochs=3, but feel free to increase or decrease this number as you wish. This paper trained a captioning model on a single state-of-the-art GPU for 3 days, but you'll soon see that you can get reasonable results in a matter of a few hours! (But of course, if you want your model to compete with current research, you will have to train for much longer.)
  • save_every - determines how often to save the model weights. We recommend that you set save_every=1, to save the model weights after each epoch. This way, after the ith epoch, the encoder and decoder weights will be saved in the models/ folder as encoder-i.pkl and decoder-i.pkl, respectively.
  • print_every - determines how often to print the batch loss to the Jupyter notebook while training. Note that you will not observe a monotonic decrease in the loss function while training - this is perfectly fine and completely expected! You are encouraged to keep this at its default value of 100 to avoid clogging the notebook, but feel free to change it.
  • log_file - the name of the text file containing - for every step - how the loss and perplexity evolved during training.

Udacity also recommended the following research papers as sources for initial values:

Show and Tell: A Neural Image Caption Generator by Vinyals, et al.
Show, Attend and Tell: Neural Image Caption Generation with Visual Attention by Xu, et al.
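For reference, a typical set of initial values might look like the following (illustrative only, loosely based on those papers and the choices discussed earlier; the actual cell is in the notebook):

```python
import torch.nn as nn

batch_size = 10          # small batches, per Section 2.0
vocab_threshold = 5      # minimum appearances before a word joins the vocabulary
vocab_from_file = False  # build the vocabulary from scratch on the first run
embed_size = 256         # dimensionality of the image and word embeddings
hidden_size = 512        # features in the LSTM hidden state
num_epochs = 1           # see Section 3.2 -- one epoch for this exercise
save_every = 1           # checkpoint the weights after every epoch
print_every = 100        # log loss and perplexity every 100 batches
log_file = "training_log.txt"

criterion = nn.CrossEntropyLoss()   # standard choice for per-token classification
```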

Finally, Udacity asked the following questions as part of the Nanodegree to challenge our decisions:

3.1 Nanodegree Questions

Question #1

Question: Describe your CNN-RNN architecture in detail. With this architecture in mind, how did you select the values of the variables in Task 1? If you consulted a research paper detailing a successful implementation of an image captioning model, please provide the reference.

Answer: As a whole, my architecture is fairly simple, because debugging is easier if you start with the basics and only increase the complexity when the basics fall short of the goal. First, a CNN encoder extracts object features from an image; then, an LSTM-Linear decoder turns those features into a caption. I didn't overthink my Task 1 variables, either; I simply chose examples that were given in the research, tried them, and raised or lowered them if the initial values didn't produce the results I expected – I am a machine, learning machine learning. For clarity, I cited the research I used in each of the sections where I used it.

Question #2

Question: How did you select the transform in transform_train? If you left the transform at its provided value, why do you think that it is a good choice for your CNN architecture?

Answer: I left them the same, because they seem like pretty typical defaults – they're exactly what I used in the previous projects, at least, so they're the default for me. I considered transforms like color jitter and random rotation, but I was concerned that such transforms might clash with the semantics of the captions associated with each image – in fact, I nearly removed horizontal flip for that reason, too.

Question #3

Question: How did you select the trainable parameters of your architecture? Why do you think this is a good choice?

Answer: Everything needs to be trained except for the pre-trained ResNet portion of the encoder. So, the latter portion of the encoder (embed) and all of the decoder are set to be trainable. Perhaps I'm confused by the question, but I think this must be a good choice, because – without setting them to be trainable – the network would just produce random garbage. Right?

Question #4

Question: How did you select the optimizer used to train your model?

Answer: Adam was a good friend in high school, so I always give him a chance first. Yeah, boi! If he can't handle it, I guess my other friends from high school, SGD and ASGD, will have a go.

3.2 Train

I chose to train for a single epoch, because performance wasn't a graded requirement of this assignment – in fact, the final segment of this project actually required the model to produce both hits and misses. However, thanks to batch normalization, a single epoch with an increased learning rate should hopefully produce a much more coherent model than such little training normally would.
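In outline, the cycle looks something like this (a sketch only; the actual cell draws its batches through Udacity's data loader, which differs a bit from a plain loop over a DataLoader):

```python
import torch
import torch.nn as nn

def train(encoder, decoder, data_loader, optimizer, num_epochs, print_every, device):
    """Rough shape of the training cycle: encode the image, decode a caption, compare to the real one."""
    criterion = nn.CrossEntropyLoss()
    encoder.to(device).train()
    decoder.to(device).train()

    for epoch in range(1, num_epochs + 1):
        for step, (images, captions) in enumerate(data_loader, start=1):
            images, captions = images.to(device), captions.to(device)

            encoder.zero_grad()
            decoder.zero_grad()

            features = encoder(images)               # (batch, embed_size)
            outputs = decoder(features, captions)    # (batch, seq_len, vocab_size)

            # CrossEntropyLoss expects (N, C) scores against (N,) targets.
            loss = criterion(outputs.view(-1, outputs.size(-1)), captions.view(-1))
            loss.backward()
            optimizer.step()

            if step % print_every == 0:
                print(f"Epoch {epoch}, step {step}: "
                      f"loss {loss.item():.4f}, perplexity {torch.exp(loss).item():.4f}")
```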

The following defines and executes the training cycle:

4.0 Test the Model

A short training cycle like that didn't take too long, but let's see how well the model performs! If testing goes well, then I'd continue training from here.

Now, on to the most exciting part: seeing whether this model can actually do anything! But, before I can demo some predictions, I need to assemble a whole new pipeline for testing and inference – feel free to skip down to Section 4.4 if you just wanna see this model take a few swings.

The following cells replicate my data transforms from earlier for use in testing, then print out a new image from the data loader in test mode:

4.1 Load the Models

The model can easily be loaded from the pickle files that were saved at the end of training; these files are essentially checkpoints that allow the model to be quickly rebuilt, with all of the same settings, exactly as it was at the moment training ended. The only settings that weren't saved were embed_size and hidden_size, so those must be re-defined explicitly.
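Roughly speaking, the reload looks like this (a sketch; it assumes the .pkl files hold state dicts saved with torch.save, that the constructors take the same arguments as the sketches above, and that the test data loader exposes its vocabulary as shown):

```python
import torch
from model import EncoderCNN, DecoderRNN

# These two weren't stored in the checkpoints, so they must match the training run.
embed_size = 256
hidden_size = 512
vocab_size = len(data_loader.dataset.vocab)   # assumption about how the loader exposes its vocabulary

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Rebuild each network from its saved weights, then switch to inference mode.
encoder.load_state_dict(torch.load("./models/encoder-1.pkl"))
decoder.load_state_dict(torch.load("./models/decoder-1.pkl"))
encoder.eval()
decoder.eval()
```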

The following does just that:

4.2 Create the Sampler

The actual work from this section can be found in the DecoderRNN class of model.py as the sample method. This method receives the Tensor features from the model and outputs a Python list that represents the predicted caption sentence, with each index of that list containing the next word of the full predicted sentence.
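Conceptually, sample is a greedy decoding loop: feed in the image features, take the most likely next word, and feed that word back in as the next input. Here's a sketch, written as a free function against the decoder sketch from Section 2.5 (in the project it's a method of DecoderRNN, and the index assumed for <end> is illustrative):

```python
def sample(decoder, features, end_index=1, max_len=20):
    """Greedy decoding: features is (1, 1, embed_size); returns a list of word indices."""
    caption = []
    inputs = features          # the image embedding stands in for the first word
    states = None              # the LSTM hidden and cell states start empty
    for _ in range(max_len):
        hiddens, states = decoder.lstm(inputs, states)      # one LSTM step
        scores = decoder.fc(hiddens.squeeze(1))             # (1, vocab_size)
        predicted = scores.argmax(dim=1)                    # index of the most likely word
        caption.append(predicted.item())
        if predicted.item() == end_index:                   # stop once <end> is predicted
            break
        inputs = decoder.word_embeddings(predicted).unsqueeze(1)  # feed the word back in
    return caption
```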

The following code, provided by Udacity, just checks whether my implementation of sample breaks anything:

4.3 Clean the Captions

The following function simply takes the output of sample, removes the <start> and <end> tokens, and combines the list elements into a single string; in other words, it cleans it:

Seeing whether this step works, in a way, confirms whether the whole pipeline works – oh, the suspense! Here goes:

What a profound thing!

4.4 Generate Predictions

Finally, we have arrived! The following get_prediction function grabs the next image from the loader, runs it through the network, cleans the output, and prints the caption!

Here's the definition of the get_prediction function:

I'll share some hits and misses in a moment; but, for now, here's a cell where you can simply mash get_prediction() to your heart's content. If you find any particularly funny ones, you know I'd love to see them!

Note: if you are viewing this project on GitHub Pages, this cell will not be interactive. Before you can "mash get_prediction() to your heart's content", you will need to clone the repo. ☹️

4.5 The Model Performed Well

Here are some selected examples of the model producing accurate captions for the given image:

4.6 The Model Could Perform Better...

And here are some selected examples of the model not totally understanding what's happening in the images:

Hey, you win some, and you lose some.

Thanks for reading!

Made by SeanvonB | Source