
Commit 13c3bb3

Merge branch 'coco' into develop

2 parents: 2328f2b + e1bb0d0

File tree: 16 files changed (+481 -267 lines)

.gitignore
Lines changed: 3 additions & 0 deletions

@@ -123,3 +123,6 @@ test_data_aug.py
 # temp checkout soln
 data/datasets/
 data/ssd_dataloader.py
+
+# pylint
+.pylintrc

README.md
Lines changed: 22 additions & 11 deletions

@@ -1,12 +1,14 @@
 # SSD: Single Shot MultiBox Object Detector, in PyTorch
 A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C. Berg. The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd).
 
+***UPDATE:*** We have just added support for MS COCO! Check it out [below](#coco).
+
 ## Authors
 
 * [**Max deGroot**](https://github.com/amdegroot)
 * [**Ellis Brown**](http://github.com/ellisbrown)
 
-***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible.
+***Note:*** Unfortunately, this is just a hobby for us and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback, as it is really appreciated. We will try to address everything as soon as possible.
 
 
 <img align="right" src="https://github.com/amdegroot/ssd.pytorch/blob/master/doc/ssd.png" height=400/>

@@ -30,7 +32,7 @@
 - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command.
 - Clone this repository.
   * Note: We currently only support Python 3+.
-- Then download the dataset by following the [instructions](#download-voc2007-trainval--test) below.
+- Then download the dataset by following the [instructions](#datasets) below.
 - We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training!
   * To use Visdom in the browser:
   ```Shell

@@ -40,21 +42,31 @@
   python -m visdom.server
   ```
   * Then (during training) navigate to http://localhost:8097/ (see the Train section below for training details).
-- Note: For training, we currently only support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/), but are adding [COCO](http://mscoco.org/) and hopefully [ImageNet](http://www.image-net.org/) soon.
+- Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon.
 
 ## Datasets
-To make things easy, we provide a simple VOC dataset loader that inherits `torch.utils.data.Dataset` making it fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html).
+To make things easy, we provide bash scripts to handle the dataset downloads and setup for you. We also provide simple dataset loaders that inherit `torch.utils.data.Dataset`, making them fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html).
+
+
+### COCO
+Microsoft COCO: Common Objects in Context
+
+##### Download COCO 2014
+```Shell
+# specify a directory for dataset to be downloaded into, else default is ~/data/
+sh data/scripts/COCO2014.sh
+```
 
 ### VOC Dataset
-##### Download VOC2007 trainval & test
+PASCAL VOC: Visual Object Classes
 
+##### Download VOC2007 trainval & test
 ```Shell
 # specify a directory for dataset to be downloaded into, else default is ~/data/
 sh data/scripts/VOC2007.sh # <directory>
 ```
 
 ##### Download VOC2012 trainval
-
 ```Shell
 # specify a directory for dataset to be downloaded into, else default is ~/data/
 sh data/scripts/VOC2012.sh # <directory>

@@ -149,12 +161,11 @@
 - Running `python -m demo.live` opens the webcam and begins detecting!
 
 ## TODO
-We have accumulated the following to-do list, which you can expect to be done in the very near future
+We have accumulated the following to-do list, which we hope to complete in the near future:
 - Still to come:
-  * Support for the MS COCO dataset
-  * Support for SSD512 training and testing
-  * Support for training on custom datasets
-
+  * [x] Support for the MS COCO dataset
+  * [ ] Support for SSD512 training and testing
+  * [ ] Support for training on custom datasets
 
 ## References
 - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325).
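
The README's claim of `torchvision.datasets` compatibility boils down to the loaders implementing `__len__` and `__getitem__`. A minimal sketch of that usage, assuming `VOCDetection` in `voc0712.py` (not shown in this commit) keeps a `(root, image_sets, ...)` constructor where `image_sets` is a list of (year, split) pairs:

```Python
# Illustrative only: the VOCDetection signature is an assumption, since
# voc0712.py is not part of this diff.
from data import VOCDetection, VOC_ROOT

dataset = VOCDetection(VOC_ROOT, [('2007', 'trainval')])
print(len(dataset))        # standard torch.utils.data.Dataset protocol
img, target = dataset[0]   # image plus its list of box/label annotations
```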

data/__init__.py
Lines changed: 23 additions & 2 deletions

@@ -1,12 +1,33 @@
-from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES
+from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT
+from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT
 from .config import *
+import torch
 import cv2
 import numpy as np
 
+def detection_collate(batch):
+    """Custom collate fn for dealing with batches of images that have a
+    different number of associated object annotations (bounding boxes).
+
+    Arguments:
+        batch: (tuple) A tuple of tensor images and lists of annotations
+
+    Return:
+        A tuple containing:
+            1) (tensor) batch of images stacked on their 0 dim
+            2) (list of tensors) annotations for a given image are stacked
+               on 0 dim
+    """
+    targets = []
+    imgs = []
+    for sample in batch:
+        imgs.append(sample[0])
+        targets.append(torch.FloatTensor(sample[1]))
+    return torch.stack(imgs, 0), targets
+
 
 def base_transform(image, size, mean):
     x = cv2.resize(image, (size, size)).astype(np.float32)
-    # x = cv2.resize(np.array(image), (size, size)).astype(np.float32)
     x -= mean
     x = x.astype(np.float32)
     return x
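
`detection_collate` exists because images carry different numbers of boxes, so the default stacking collate would fail. A minimal sketch of wiring it into a `DataLoader`; the `Resize300` wrapper and the `'trainval35k'` split name are illustrative assumptions, not part of this commit:

```Python
# A sketch, not the commit's training code. Resize300 and 'trainval35k'
# are assumptions made for illustration.
import torch.utils.data as data
from data import (COCODetection, COCOAnnotationTransform, detection_collate,
                  base_transform, COCO_ROOT)

class Resize300:
    """Resize + mean-subtract via base_transform; boxes/labels pass through,
    which is safe here because COCOAnnotationTransform normalizes them."""
    def __init__(self, size=300, mean=(104, 117, 123)):
        self.size, self.mean = size, mean
    def __call__(self, img, boxes, labels):
        return base_transform(img, self.size, self.mean), boxes, labels

dataset = COCODetection(COCO_ROOT, 'trainval35k', transform=Resize300(),
                        target_transform=COCOAnnotationTransform())
loader = data.DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4,
                         collate_fn=detection_collate)
images, targets = next(iter(loader))
# images: FloatTensor of shape (32, 3, 300, 300)
# targets: list of 32 tensors, each (num_objects, 5) = [xmin, ymin, xmax, ymax, label]
```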

data/coco.py
Lines changed: 180 additions & 0 deletions

@@ -0,0 +1,180 @@
+from .config import HOME
+import os
+import os.path as osp
+import sys
+import torch
+import torch.utils.data as data
+import torchvision.transforms as transforms
+import cv2
+import numpy as np
+
+COCO_ROOT = osp.join(HOME, 'data/coco/')
+IMAGES = 'images'
+ANNOTATIONS = 'annotations'
+COCO_API = 'PythonAPI'
+INSTANCES_SET = 'instances_{}.json'
+COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+                'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+                'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
+                'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
+                'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
+                'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+                'kite', 'baseball bat', 'baseball glove', 'skateboard',
+                'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
+                'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+                'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog',
+                'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+                'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
+                'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
+                'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+                'scissors', 'teddy bear', 'hair drier', 'toothbrush')
+
+
+class COCOAnnotationTransform(object):
+    """Transforms a COCO annotation into a Tensor of bbox coords and label
+    index. Initialized with a dictionary lookup of class names to indexes.
+    """
+    def __init__(self):
+        self.label_map = get_label_map(osp.join(COCO_ROOT, 'coco_labels.txt'))
+
+    def __call__(self, target, width, height):
+        """
+        Args:
+            target (dict): COCO target json annotation as a python dict
+            width (int): image width
+            height (int): image height
+        Returns:
+            a list containing lists of bounding boxes [bbox coords, class idx]
+        """
+        scale = np.array([width, height, width, height])
+        res = []
+        for obj in target:
+            if 'bbox' in obj:
+                bbox = obj['bbox']
+                # COCO boxes are [xmin, ymin, w, h]; convert to corner form
+                bbox[2] += bbox[0]
+                bbox[3] += bbox[1]
+                label_idx = self.label_map[obj['category_id']] - 1
+                final_box = list(np.array(bbox) / scale)
+                final_box.append(label_idx)
+                res += [final_box]  # [xmin, ymin, xmax, ymax, label_idx]
+            else:
+                print("no bbox problem!")
+
+        return res  # [[xmin, ymin, xmax, ymax, label_idx], ... ]
+
+
+class COCODetection(data.Dataset):
+    """`MS Coco Detection <http://mscoco.org/dataset/#detections-challenge2016>`_ Dataset.
+    Args:
+        root (string): Root directory where images are downloaded to.
+        image_set (string): Name of the specific set of COCO images.
+        transform (callable, optional): A function/transform that augments
+            the raw images.
+        target_transform (callable, optional): A function/transform that
+            takes in the target (bbox) and transforms it.
+    """
+
+    def __init__(self, root, image_set, transform=None,
+                 target_transform=None):
+        sys.path.append(osp.join(root, COCO_API))
+        from pycocotools.coco import COCO
+        self.root = osp.join(root, IMAGES, image_set)
+        self.coco = COCO(osp.join(root, ANNOTATIONS,
+                                  INSTANCES_SET.format(image_set)))
+        self.ids = list(self.coco.imgToAnns.keys())
+        self.transform = transform
+        self.target_transform = target_transform
+        self.name = 'MS COCO ' + image_set
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: Tuple (image, target).
+                   target is the object returned by ``coco.loadAnns``.
+        """
+        im, gt, h, w = self.pull_item(index)
+        return im, gt
+
+    def __len__(self):
+        return len(self.ids)
+
+    def pull_item(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: Tuple (image, target, height, width).
+                   target is the object returned by ``coco.loadAnns``.
+        """
+        img_id = self.ids[index]
+        ann_ids = self.coco.getAnnIds(imgIds=img_id)
+        target = self.coco.loadAnns(ann_ids)
+        path = osp.join(self.root, self.coco.loadImgs(img_id)[0]['file_name'])
+        assert osp.exists(path), 'Image path does not exist: {}'.format(path)
+        img = cv2.imread(path)
+        height, width, _ = img.shape
+        if self.target_transform is not None:
+            target = self.target_transform(target, width, height)
+        if self.transform is not None:
+            target = np.array(target)
+            img, boxes, labels = self.transform(img, target[:, :4],
+                                                target[:, 4])
+            # to rgb
+            img = img[:, :, (2, 1, 0)]
+
+            target = np.hstack((boxes, np.expand_dims(labels, axis=1)))
+        return torch.from_numpy(img).permute(2, 0, 1), target, height, width
+
+    def pull_image(self, index):
+        '''Returns the original image object at index in cv2 form
+
+        Note: not using self.__getitem__(), as any transformations passed in
+        could mess up this functionality.
+
+        Argument:
+            index (int): index of img to show
+        Return:
+            cv2 img (BGR, no normalization applied)
+        '''
+        img_id = self.ids[index]
+        path = self.coco.loadImgs(img_id)[0]['file_name']
+        return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR)
+
+    def pull_anno(self, index):
+        '''Returns the original annotation of image at index
+
+        Note: not using self.__getitem__(), as any transformations passed in
+        could mess up this functionality.
+
+        Argument:
+            index (int): index of img to get annotation of
+        Return:
+            list: the raw COCO annotation dicts returned by ``coco.loadAnns``
+        '''
+        img_id = self.ids[index]
+        ann_ids = self.coco.getAnnIds(imgIds=img_id)
+        return self.coco.loadAnns(ann_ids)
+
+    def __repr__(self):
+        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
+        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
+        fmt_str += '    Root Location: {}\n'.format(self.root)
+        tmp = '    Transforms (if any): '
+        fmt_str += '{0}{1}\n'.format(
+            tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
+        tmp = '    Target Transforms (if any): '
+        fmt_str += '{0}{1}'.format(
+            tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
+        return fmt_str
+
+
+def get_label_map(label_file):
+    # maps COCO's sparse category ids (1-90, with gaps) onto contiguous ids
+    label_map = {}
+    with open(label_file, 'r') as labels:
+        for line in labels:
+            ids = line.split(',')
+            label_map[int(ids[0])] = int(ids[1])
+    return label_map
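
The box arithmetic in `COCOAnnotationTransform.__call__` is the subtle part of this file: COCO stores boxes as `[xmin, ymin, width, height]`, and the transform converts them to corner form normalized by image size. A hand-worked example with made-up numbers:

```Python
# Worked example of the conversion in COCOAnnotationTransform.__call__;
# the annotation dict and image size are made up.
import numpy as np

width, height = 640, 480
obj = {'category_id': 18, 'bbox': [10.0, 20.0, 300.0, 200.0]}  # xmin, ymin, w, h

bbox = obj['bbox']
bbox[2] += bbox[0]   # xmax = 10 + 300 = 310
bbox[3] += bbox[1]   # ymax = 20 + 200 = 220
scale = np.array([width, height, width, height])
print(list(np.array(bbox) / scale))
# [0.015625, 0.041666..., 0.484375, 0.458333...] -> normalized corners
```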

data/config.py
Lines changed: 29 additions & 57 deletions

@@ -2,63 +2,35 @@
 import os.path
 
 # gets home dir cross platform
-home = os.path.expanduser("~")
-ddir = os.path.join(home, "data/VOCdevkit/")
-
-# note: if you used our download scripts, this should be right
-VOCroot = ddir  # path to VOCdevkit root dir
-
-# default batch size
-BATCHES = 32
-# data reshuffled at every epoch
-SHUFFLE = True
-# number of subprocesses to use for data loading
-WORKERS = 4
-
-
-# SSD300 CONFIGS
-# newer version: use additional conv11_2 layer as last layer before multibox layers
-v2 = {
-    'feature_maps' : [38, 19, 10, 5, 3, 1],
-    'min_dim' : 300,
-    'steps' : [8, 16, 32, 64, 100, 300],
-    'min_sizes' : [30, 60, 111, 162, 213, 264],
-    'max_sizes' : [60, 111, 162, 213, 264, 315],
-    # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3],
-    #                    [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]],
-    'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
-    'variance' : [0.1, 0.2],
-    'clip' : True,
-    'name' : 'v2',
+HOME = os.path.expanduser("~")
+
+# for making bounding boxes pretty
+COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128),
+          (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128))
+
+MEANS = (104, 117, 123)
+
+# SSD300 CONFIGS
+voc = {
+    'feature_maps': [38, 19, 10, 5, 3, 1],
+    'min_dim': 300,
+    'steps': [8, 16, 32, 64, 100, 300],
+    'min_sizes': [30, 60, 111, 162, 213, 264],
+    'max_sizes': [60, 111, 162, 213, 264, 315],
+    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
+    'variance': [0.1, 0.2],
+    'clip': True,
+    'name': 'VOC',
 }
 
-# use average pooling layer as last layer before multibox layers
-v1 = {
-    'feature_maps' : [38, 19, 10, 5, 3, 1],
-    'min_dim' : 300,
-    'steps' : [8, 16, 32, 64, 100, 300],
-    'min_sizes' : [30, 60, 114, 168, 222, 276],
-    'max_sizes' : [-1, 114, 168, 222, 276, 330],
-    # 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
-    'aspect_ratios' : [[1, 1, 2, 1/2], [1, 1, 2, 1/2, 3, 1/3], [1, 1, 2, 1/2, 3, 1/3],
-                       [1, 1, 2, 1/2, 3, 1/3], [1, 1, 2, 1/2, 3, 1/3], [1, 1, 2, 1/2, 3, 1/3]],
-    'variance' : [0.1, 0.2],
-    'clip' : True,
-    'name' : 'v1',
+coco = {
+    'feature_maps': [38, 19, 10, 5, 3, 1],
+    'min_dim': 300,
+    'steps': [8, 16, 32, 64, 100, 300],
+    'min_sizes': [21, 45, 99, 153, 207, 261],
+    'max_sizes': [45, 99, 153, 207, 261, 315],
+    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
+    'variance': [0.1, 0.2],
+    'clip': True,
+    'name': 'COCO',
 }
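
As a sanity check on the new config dicts (not part of the commit), the `feature_maps` and `aspect_ratios` entries determine the SSD300 prior-box count: each feature-map location gets two square boxes plus an (ar, 1/ar) pair per listed aspect ratio, which recovers the canonical 8732 priors:

```Python
# Sanity check, not from the commit: derive the SSD300 prior count from voc.
from data.config import voc

total = 0
for k, f in enumerate(voc['feature_maps']):
    per_loc = 2 + 2 * len(voc['aspect_ratios'][k])  # 2 squares + (ar, 1/ar) pairs
    total += f * f * per_loc
print(total)  # 8732
```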
