# This YAML file is rendered to an HTML file in the same folder.
# This should currently reside at:
# http://www0.cs.ucl.ac.uk/staff/M.Firman/RGBDdatasets/index.html

-   title:  "Datasets capturing single objects"
    desc:   "These datasets capture objects under fairly controlled conditions. Bigbird is the most advanced in terms of quality of image data and camera poses, while the RGB-D object dataset is the most extensive."
    items:
    -   name:       RGBD Object dataset
        linktype:   project_page
        link:       http://rgbd-dataset.cs.washington.edu/
        date:       ICRA 2011
        sensor:     Kinect v1
        type:       Turntable
        desc:       300 instances of household objects, in 51 categories. 250,000 frames in total
        labelling:  Category and instance labelling. Includes auto-generated masks, but no exact 6DOF pose information.
        image:      image05.jpg
        ref:        lai-icra-2011
        qualities:  ['multiframe']

    -   name:       Bigbird dataset
        linktype:   project_page
        link:       http://rll.berkeley.edu/bigbird/
        date:       ICRA 2014
        sensor:     Kinect v1 and DSLR
        type:       Turntable
        desc:       100 household objects
        labelling:  Instance labelling. Masks, ground truth poses, registered mesh.
        image:      image04.jpg
        ref:        singh-icra-2014
        qualities:  ['multiframe', 'model', 'objectpose', 'camerapose']

    -   name:       'A large dataset of object scans'
        linktype:   project_page
        link:       http://redwood-data.org/3dscan/
        date:       '2016'
        sensor:     PrimeSense Carmine
        type:       Turntable
        desc:       "Over 10,000 objects densely scanned and reconstructed. Data captured from the real world by non-technical operators."
        labelling:  Object present in each scan.
        image:      image62.jpg

-   title:  "Segmentation, detection and pose estimation under controlled conditions"
    desc:   "These datasets include objects arranged in controlled conditions. Clutter may be present. CAD or meshed models of the objects may or may not be provided. Most provide 6DOF ground truth pose for each object."
    items:
    -   name:       Object segmentation dataset
        linktype:   project_page
        link:       http://www.acin.tuwien.ac.at/?id=289
        date:       IROS 2012
        sensor:     Kinect v1
        type:
        desc:       111 RGBD images of stacked and occluding objects on table.
        labelling:  Per-pixel segmentation into objects.
        image:      image06.jpg
        ref:        richtsfeld-iros-2012
        qualities:  []

    -   name:       Willow Garage Dataset
        linktype:   project_page
        link:       https://repo.acin.tuwien.ac.at/tmp/permanent/dataset_index.php
        date:       '2011'
        sensor:     Kinect v1
        type:
        desc:       353 frames of 110 different household objects on a board in controlled environment.
        labelling:  6DOF pose for each object, taken from board calibration. Per-pixel labelling.
        image:      image11.jpg
        ref:        aldoma-www-2012
        qualities:  ['objectpose']

    -   name:       TUW Dataset
        linktype:   project_page
        link:       https://repo.acin.tuwien.ac.at/tmp/permanent/dataset_index.php
        date:       IROS 2014
        sensor:     Kinect v1
        type:
        desc:       15 multi-view sequences of indoor scenes, totalling 163 frames.  Also 3 dynamic scenes. 162 different objects.
        labelling:  6DOF pose for each object
        image:      image63.jpg
        ref:        aldoma-iros-2014
        qualities:  ['objectpose']

    -   name:       "'3D Model-based Object Recognition and Segmentation in Cluttered Scenes'"
        linktype:   project_page
        link:       http://www.csse.uwa.edu.au/~ajmal/recognition.html
        date:       PAMI 2006
        sensor:     Minolta Vivid 910 (only depth, no RGB!)
        type:
        desc:       50 frames  depicting five objects in various occluding poses. No background clutter in any image.
        labelling:  Pose and per-point labelling information. 3D mesh models of each of the 5 objects.
        image:      image13.jpg
        ref:        mian-pami-2006
        qualities:  ['objectpose']

    -   name:       "'A Global Hypotheses Verification Method for 3D Object Recognition'"
        linktype:   direct
        link:       http://users.acin.tuwien.ac.at/aaldoma/datasets/ECCV.zip
        date:       ECCV 2012
        sensor:     Kinect v1
        type:
        desc:       50 Kinect frames, library of 35 objects
        labelling:  6DOF pose of each object (unsure how this was gathered). No per-pixel labelling.
        image:      image12.jpg
        ref:        aldoma-eccv-2012
        qualities:  ['objectpose']

    -   name:       "'Model Based Training, Detection and Pose Estimation of Texture-Less 3D Objects in Heavily Cluttered Scenes'"
        linktype:   project_page
        link:       http://campar.in.tum.de/Main/StefanHinterstoisser
        date:       ACCV 2012
        sensor:     Kinect v1
        type:
        desc:       18,000 Kinect images, library of 15 objects.
        labelling:  6DOF pose for each object in each image. No per-pixel labelling.
        image:      image09.jpg
        ref:        hinterstoisser-accv-2012
        qualities:  ['multiframe', 'objectpose', 'camerapose']

    -   name:       "'RGB-D Semantic Segmentation Dataset'"
        linktype:   project_page
        link:       http://vision.deis.unibo.it/fede/kinectDataset.html
        date:       IROS 2011
        sensor:     Kinect v1
        type:
        desc:       16 test scenes of household objects, plus 3D training models for each category.
        labelling:  Semantic segmentation of each scene.
        image:      image54.jpg
        ref:        tombari-iros-2011

    -   name:       "T-LESS: An RGB-D Dataset for 6D Pose Estimation of Texture-less Objects"
        linktype:   project_page
        link:       http://cmp.felk.cvut.cz/t-less/
        date:       WACV 2017
        sensor:     Primesense Carmine 1.09, Microsoft Kinect v2, Canon IXUS 950 IS (the sensors were synchronized)
        type:
        desc:       "30 texture-less objects. 39K training and 10K test images from each sensor. Two types of 3D models for each object - a manually created CAD model and a semi-automatically reconstructed one."
        labelling:  "6DOF pose for each object in each image. Per-pixel labelling can be obtained by rendering of the object models at the ground truth poses."
        image:      image64.jpg
        ref:
        qualities:  ['multiframe', 'objectpose', 'camerapose']


-   title: "Kinect data from the real world"
    desc:
    items:
    -   name:       RGBD Scenes dataset
        linktype:   project_page
        link:       http://rgbd-dataset.cs.washington.edu/dataset/rgbd-scenes/
        date:       ICRA 2011
        sensor:     Kinect v1
        type:
        desc:       Real indoor scenes, featuring objects from the RGBD object dataset 'arranged' on tables, countertops etc. Video sequences of 8 scenes.
        labelling:  Per-frame bounding boxes for objects from RGBD object dataset. Other objects not labelled.
        image:      image18.jpg
        ref:        lai-icra-2011
        qualities:  ['multiframe']

    -   name:       RGBD Scenes dataset v2
        linktype:   project_page
        link:       http://rgbd-dataset.cs.washington.edu/dataset/rgbd-scenes-v2/
        date:       ICRA 2014
        sensor:     Kinect v1
        type:
        desc:       A second set of real indoor scenes featuring objects from the RGBD object dataset. Video sequences of 14 scenes, together with stitched point clouds and camera pose estimations.
        labelling:  Labelling of points in stitched cloud into one of 9 classes (objects and furniture), plus background.
        image:      image30.jpg
        ref:        lai-icra-2014
        qualities:  ['multiframe', 'model', 'camerapose']

    -   name:       "'Object Disappearance for Object Discovery'"
        linktype:   project_page
        link:       http://wiki.ros.org/Papers/IROS2012_Mason_Marthi_Parr
        date:       IROS 2012
        sensor:     Kinect v1
        type:
        desc:       "Three datasets: Small, with still images. Medium, video data from an office environment. Large, video over several rooms. Large dataset has 7 unique objects seen in 397 frames. Data is in ROS bag format."
        labelling:  Ground truth object segmentations.
        image:      image03.jpg
        ref:        mason-iros-2012
        qualities:  ['multiframe']

    -   name:       "'Object Discovery in 3D scenes via Shape Analysis'"
        linktype:   project_page
        link:       http://cs.stanford.edu/people/karpathy/discovery/
        date:       ICRA 2013
        sensor:     Kinect v1
        type:
        desc:       KinFu meshes of 58 very cluttered indoor scenes.
        labelling:  Ground truth binary labelling (object/not object) performed on segments proposed by the algorithm, with no labelling on the mesh.
        image:      image02.jpg
        ref:        karpathy-icra-2013
        qualities:  ['model']

    -   name:       Cornell-RGBD-Dataset
        linktype:   project_page
        link:       http://pr.cs.cornell.edu/sceneunderstanding/data/data.php
        date:       NIPS 2011
        sensor:     Kinect v1
        type:
        desc:       Multiple RGBD frames from 52 indoor scenes. Stitched point clouds (using RGBDSLAM).
        labelling:  Per-point object-level labelling on the stitched clouds.
        image:      image10.jpg
        ref:        koppula-nips-2011
        qualities:  ['multiframe', 'camerapose']

    -   name:       NYU Dataset v1
        linktype:   project_page
        link:       http://cs.nyu.edu/~silberman/datasets/nyu_depth_v1.html
        date:       ICCV 2011 Workshop on 3D Representation and Recognition
        sensor:     Kinect v1
        type:
        desc:       Around 51,000 RGBD frames from indoor scenes such as bedrooms and living rooms. Note that the updated NYU v2 dataset is typically used instead of this earlier version.
        labelling:  Dense multi-class labelling for 2283 frames.
        image:      image28.jpeg
        ref:        silberman-iccv-2011
        qualities:  ['multiframe']
        # labelling:  Dense semantic

    -   name:       NYU Dataset v2
        linktype:   project_page
        link:       http://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html
        date:       ECCV 2012
        sensor:     Kinect v1
        type:
        desc:       ~408,000 RGBD images from 464 indoor scenes, of a somewhat larger diversity than NYU v1. Per-frame accelerometer data.
        labelling:  Dense labelling of objects at a class and instance level for 1449 frames. Instance labelling is not carried across scenes. This 1449 subset is the dataset typically used in experiments.
        image:      image14.jpg
        ref:        silberman-eccv-2012
        qualities:  ['multiframe']
        # labelling:  Dense semantic

    -   name:       "'Object Detection and Classification from Large-Scale Cluttered Indoor Scans'"
        linktype:   project_page
        link:       http://www.ifi.uzh.ch/vmml/publications/ObjDetandClas.html
        date:       Eurographics 2014
        sensor:     Faro Lidar scanner
        type:
        desc:       Faro lidar scans of ~40 academic offices, with 2-3 scans per office. Each scan is 0.25GB-2GB. Scans include depth and RGB.
        labelling:  No labelling present. The labelling shown in the exemplar image is their algorithm output.
        image:      image16.jpg
        ref:        mattausch-eurographics-2014
        qualities:  ['lidar', 'camerapose']

    -   name:       SUN3D
        linktype:   project_page
        link:       http://sun3d.cs.princeton.edu/
        date:       ICCV 2013
        sensor:     Kinect v1
        type:
        desc:       Videos of indoor scenes, registered into point clouds.
        labelling:  Polygons of semantic class and instance labels on frames propagated through video.
        image:      image17.jpg
        ref:        xiao-iccv-2013
        qualities:  ['multiframe', 'camerapose']
        # labelling:  Dense semantic

    -   name:       SUN RGB-D
        linktype:   project_page
        link:       http://rgbd.cs.princeton.edu/
        date:       CVPR 2015
        sensor:     Kinect v1, Kinect v2, Intel RealSense and Asus Xtion Live Pro
        type:
        desc:       New images, plus images taken from NYUv2, B3DO and SUN3D. All of indoor scenes.
        labelling:  "10,335 images with polygon annotation, and 3D bounding boxes around objects"
        image:      image41.jpg
        ref:        song-cvpr-2015
        qualities:  []
        # labelling:  Dense semantic

    -   name:       "B3DO: Berkeley 3-D Object Dataset"
        linktype:   project_page
        link:       http://kinectdata.com/
        date:       ICCV Workshop on Consumer Depth Cameras in Computer Vision 2011
        sensor:     Kinect v1
        type:
        desc:       Aim is to crowdsource collection of Kinect data, to be included in future releases. Version 1 has 849 images, from 75 scenes.
        labelling:  Bounding box labelling at a class level.
        ref:        janoch-iccv-2011
        image:      image00.jpg
        qualities:  []
        # labelling:  Bounding box

    -   name:       "Kinect RGBD Dataset for Category Modeling"
        linktype:   project_page
        link:       http://shiba.iis.u-tokyo.ac.jp/song/?page_id=343
        date:       CVPR 2013
        sensor:     Kinect v1
        type:
        desc:       900 RGBD images from seven different categories. Some images naturally captured, others with specifically arranged objects.
        labelling:  Category of dominant object in each image
        image:      image44.jpg
        ref:        zhang-cvpr-2013
        qualities:  []

    # -   name: Toy dataset
    #     date: VISAPP 2016
    #     sensor: Kinect v1
    #     desc: 449 RGB-D images of multiple toys on tabletop.
    #     labelling: Per-pixel segmentation into objects.
    #     link:       http://irobotics.aalto.fi/software-and-data/toy-dataset
    #     linktype:   project_page
    #     image:      image61.jpg

    -   name: "ViDRILO: The Visual and Depth Robot Indoor Localization with Objects information dataset"
        date:   IJRR 2015
        sensor: Kinect v1
        desc: "Five sequences (total 22454 frames) captured from a robot moving through an office environment"
        labelling: "Scene type of each frame, plus presence/absence of each of a set of 15 objects."
        link:       http://www.rovit.ua.es/dataset/vidrilo/
        linktype:   project_page
        image:      image66.jpg

    -   name:       SceneNN
        linktype:   project_page
        link:       http://www.scenenn.net
        date:       3DV 2016
        sensor:     Asus Xtion PRO
        type:
        desc:       "Videos of indoor scenes, registered into triangle meshes."
        labelling:  "Per-vertex and per-pixel instance segmentation, bounding boxes and object poses."
        image:      image71.jpg
        ref:        hua-3dv-2016
        qualities:  ['multiframe', 'camerapose']

    -   name:       "GMU Kitchen Dataset"
        linktype:   project_page
        link:       http://cs.gmu.edu/~robot/gmu-kitchens.html
        date:       3DV 2016
        sensor:     Kinect v2
        type:
        desc:       "9 video sequences captured from 4 different kitchens, each containing objects from the BigBIRD dataset."
        labelling:  "Per-frame camera pose, 3D point clouds, object bounding box annotations and point labels."
        image:      image72.jpg
        ref:


    -   name:       "Stanford 2D-3D-Semantics Dataset"
        linktype:   project_page
        link:       http://buildingparser.stanford.edu/dataset.html
        date:       arXiv 2017
        sensor:     Matterport Camera (360 degree rotation RGBD sensor)
        type:
        desc:       "360 degree RGBD images captured from 6 large areas in municipal buildings, together with mesh and point cloud reconstructions."
        labelling:  "Semantic labelling on the mesh (13 classes, plus instance labels), and 3D volumetric reconstruction labels."
        image:      image74.jpg
        ref:


    -   name:       "Active Vision Dataset (AVD)"
        linktype:   project_page
        link:       http://cs.unc.edu/~ammirato/active_vision_dataset_website/index.html
        date:       ICRA 2017
        sensor:     Kinect v2
        type:
        desc:       "Dense sampling of images in home and office scenes, captured from a robot. Dataset designed for simulation of motion and instance detection."
        labelling:  "Per-frame camera pose, object instance bounding boxes, movement pointers between images."
        image:      image78.jpg
        ref:


    -   name:       "ScanNet"
        linktype:   project_page
        link:       http://www.scan-net.org/
        date:       CVPR 2017
        sensor:     Structure sensor
        type:
        desc:       "2.5 million frames from 1513 scenes"
        labelling:  "Automatically computed (and human verified) camera poses and surface reconstructions. Instance and semantic segmentations provided on reconstructed mesh. 3D CAD models + alignment also provided for each scene."
        image:      image77.png
        ref:





-   title: "SLAM, registration and reconstruction"
    desc:
    items:
    -   name:       TUM Benchmark Dataset
        linktype:   project_page
        link:       http://vision.in.tum.de/data/datasets/rgbd-dataset
        date:       IROS 2012
        sensor:     Kinect v1
        type:
        desc:       "Many different scenes and scenarios for tracking and mapping, including reconstruction, robot kidnap etc."
        labelling:  "6DOF ground truth from motion capture system with 10 cameras."
        image:      image15.jpg
        ref:        sturm-iros-2012
        qualities:  ['multiframe', 'gtcamerapose']

    -   name:       Microsoft 7-scenes dataset
        linktype:   project_page
        link:       http://research.microsoft.com/en-us/projects/7-scenes/
        date:       CVPR 2013
        sensor:     Kinect v1
        type:
        desc:       Kinect video from 7 indoor scenes.
        labelling:  "6DOF 'ground truth' from Kinect Fusion."
        image:      image08.jpg
        ref:        shotton-cvpr-2013
        qualities:  ['multiframe', 'camerapose', 'model']

    -   name:       IROS 2011 Paper Kinect Dataset
        linktype:   project_page
        link:       http://projects.asl.ethz.ch/datasets/doku.php?id=Kinect:iros2011Kinect
        date:       IROS 2011
        sensor:     Kinect v1
        type:
        desc:       Lab-based setup. The aim seems to be to track the motion of camera.
        labelling:  6DOF ground truth from Vicon system
        image:      image01.jpg
        ref:        pomerleau-iros-2011
        qualities:  ['multiframe', 'gtcamerapose']

    -   name:       "'When Can We Use KinectFusion for Ground Truth Acquisition?'"
        linktype:   project_page
        link:       http://hci.iwr.uni-heidelberg.de//Benchmarks/document/kinectFusionCapture/
        date:       "Workshop on Color-Depth Camera Fusion in Robotics, IROS 2012"
        sensor:     Kinect v1
        type:
        desc:       "A set of 57 scenes, captured from natural environments and from artificial shapes. Each scene has a 3D mesh, volumetric data and registered depth maps."
        labelling:  "Frame-to-frame transformations as computed from KinectFusion. The 'office' and 'statue' scenes have LiDAR ground truth."
        image:      image24.jpg
        ref:        meister-iros-2012
        qualities:  ['multiframe', 'camerapose', 'model']

    -   name:       DAFT Dataset
        linktype:   project_page
        link:       http://ias.cs.tum.edu/people/gossow/rgbd
        date:       ICPR 2012
        sensor:     Kinect v1
        type:
        desc:       A few short sequences of different planar scenes captured under various camera motions. Used to demonstrate repeatability of feature points under transformations.
        labelling:  Camera motion type. 2D homographies between the planar scene in different images.
        image:      image29.jpg
        ref:        gossow-icpr-2012
        qualities:  ['multiframe']

    -   name:       "'Automatic Registration of RGB-D Scans via Salient Directions'"
        linktype:   project_page
        link:       http://www.cvg.ethz.ch/research/saldir-rgbd-registration/
        date:       ICCV 2013
        sensor:     RGBD Laser scanner
        type:
        desc:       "Several laser scans taken from each of a European church, city and castle scenes."
        labelling:  "Results of the authors' registration algorithm."
        image:      image34.jpg
        qualities:  ['multiframe', 'lidar']

    -   name:       "Stanford 3D Scene Dataset"
        linktype:   project_page
        link:       http://qianyi.info/scenedata.html
        date:       SIGGRAPH 2013
        sensor:     Xtion Pro Live (Kinect v1 equivalent)
        type:
        desc:       RGBD videos of six indoor and outdoor scenes, together with a dense reconstruction of each scene.
        labelling:  Estimated camera pose for each frame. No ground truth pose, so not ideal for quantitative evaluation.
        image:      image37.jpg
        ref:        zhou-siggraph-2013
        qualities:  ['multiframe', 'camerapose']

    -   name:       "CoRBS: Comprehensive RGB-D Benchmark for SLAM using Kinect v2"
        linktype:   project_page
        link:       http://corbs.dfki.uni-kl.de/
        date:       WACV 2016
        sensor:     Kinect v2
        type:
        desc:       "Twenty sequences from four scenes, with ground truth for trajectory and geometry."
        labelling:  "6DOF ground truth trajectory from motion capture system and ground truth geometry from active scanner."
        image:      image60.jpg
        ref:        wasenmuller-wacv-2016
        qualities:  ['multiframe', 'gtcamerapose', 'model', 'objectpose']


    -   name:       "'MobileRGBD, An open benchmark corpus for mobile RGB-D related algorithms'"
        linktype:   project_page
        link:       http://mobilergbd.inrialpes.fr/
        date:       ICARCV 2014
        sensor:     "Kinect v2, lidar"
        type:
        desc:       "9.5 hours of recording in 4 different environments, comprising RGBD, infrared and LIDAR. Environments have dummies placed to simulate humans."
        labelling:  "Position, orientation and speed of the robot at each frame, actual ground plane, height and angle of the Kinect, and the dummies' 3D positions in the room."
        image:      image75.jpg


    -   name:       "Depth Reconstruction Occlusionless Temporal (DROT) Dataset"
        linktype:   project_page
        link:       https://drotman-technion.wixsite.com/drotdataset
        date:       3DV 2016
        sensor:     Kinect v1, v2 and RealSense R200
        type:
        desc:       "Five stop-motion sequences of 11-30 frames each"
        labelling:  "Registrations between each camera, together with ground truth depth from David SLS-2 3D scanner."
        image:      image76.png


    -   name: "CVSSP Dynamic RGBD Modelling"
        linktype:   project_page
        link:       http://cvssp.org/projects/4d/dynamic_rgbd_modelling/
        date:       Circuits and Systems for Video Technology 2018
        desc: "Eight RGBD sequences of general dynamic scenes captured using the Kinect V1/V2 as well as two synthetic sequences. Designed for non-rigid reconstruction."
        sensor:     Kinect v1, v2 and synthetic
        type:
        labelling:  "None"
        image:      image79.png


    -   name:       "'Shading-based Refinement on Volumetric Signed Distance Functions'"
        linktype:   project_page
        link:       http://graphics.stanford.edu/projects/vsfs/
        date:       TOG 2015
        sensor:     PrimeSense
        type:
        desc:       "Four RGBD sequences of small statues and artefacts."
        labelling:  "6DOF inferred camera trajectory, plus fused (and refined) reconstructions."
        image:      image65.jpg

-   title: "Synthetic"
    desc:   Synthetic datasets get their own section, as they can typically be used for multiple purposes.
    items:

    -   name:       ICL-NUIM Dataset
        linktype:   project_page
        link:       http://www.doc.ic.ac.uk/~ahanda/VaFRIC/iclnuim.html
        date:       ICRA 2014
        sensor:     Kinect v1 (synthesised)
        type:
        desc:       "Eight synthetic RGBD video sequences: four from an office scene and four from a living room scene. Simulated camera trajectories are taken from a Kintinuous output from a sensor being moved around a real-world room."
        labelling:  Camera trajectories for each video. Geometry of the living room scene as an .obj file.
        image:      image31.jpg
        ref:        handa-icra-2014
        qualities:  ['multiframe', 'gtcamerapose', 'model']

    -   name:       Augmented ICL-NUIM Dataset
        linktype:   project_page
        link:       http://redwood-data.org/indoor/dataset.html
        date:       CVPR 2015
        sensor:     Kinect v1 (synthesised)
        type:
        desc:       "An augmentation of the ICL-NUIM dataset, with camera paths added to allow it to be used for scene reconstruction."
        labelling:  "In addition to ICL-NUIM: New camera paths for each scene, plus a noise model and a point based surface model to enable reconstruction evaluation."
        image:      image50.jpg
        ref:        choi-cvpr-2015
        qualities:  ['multiframe', 'gtcamerapose', 'model']

    -   name:       SceneNet RGB-D
        linktype:   project_page
        link:       https://robotvault.bitbucket.io/scenenet-rgbd.html?
        date:       arXiv 2016
        sensor:     Kinect v1 (synthesised)
        type:
        desc:       "5 million images rendered of 16,895 indoor scenes. Room configuration randomly generated with physics simulator."
        labelling:  "Camera pose, plus per-pixel instance, class labelling and optical flow."
        image:      image69.jpg
        ref:
        qualities:  []

    -   name:       SUNCG
        linktype:   project_page
        link:       http://suncg.cs.princeton.edu/
        date:       arXiv 2016
        sensor:     User choice
        type:
        desc:       "45,622 scenes with manually created room and furniture layouts. Images can be rendered from the geometry, but are not provided by default."
        labelling:  "Object semantic class and instance labelling."
        image:      image70.jpg
        ref:
        qualities:  []


-   title: "Tracking"
    desc:   See also some of the human datasets for body and face tracking.
    items:
    -   name:       Princeton Tracking Benchmark
        linktype:   project_page
        link:       http://tracking.cs.princeton.edu/dataset.html
        date:       ICCV 2013
        sensor:     Kinect v1
        type:
        desc:       100 RGBD videos of moving objects such as humans, balls and cars.
        labelling:  Per-frame bounding box covering target object only.
        image:      image38.jpg
        ref:        song-iccv-2012
        qualities:  ['multiframe']
        # labelling:  'boundingbox'

-   title:  "Datasets involving humans: Body and hands"
    desc:
    items:
    -   name:       "Cornell Activity Datasets: CAD-60 and CAD-120"
        linktype:   project_page
        link:       http://pr.cs.cornell.edu/humanactivities/data.php
        date:       "PAIR 2011/IJRR 2013"
        sensor:     Kinect v1
        type:
        desc:       "Videos of humans performing activities"
        labelling:  "Each video given at least one label, such as eating, opening or working on computer. Skeleton joint position and orientation labelled on each frame."
        image:      image19.jpg
        ref:        ["koppula-ijrr-2013", "sung-pair-2011"]

    -   name:       RGB-D Person Re-identification Dataset
        linktype:   project_page
        link:       http://www.iit.it/en/datasets-and-code/datasets/rgbdid.html
        date:       First International Workshop on Re-Identification 2012
        sensor:     Kinect v1
        type:
        desc:       Front and back poses of 79 people walking forward in different poses.
        labelling:  "In addition to the per-person label, the dataset provides foreground masks, skeletons, 3D meshes and an estimate of the floor."
        image:      image20.jpg
        ref:        barbosa-eccvw-2012

    -   name:       "Sheffield Kinect Gesture (SKIG) Dataset"
        linktype:   project_page
        link:       http://lshao.staff.shef.ac.uk/data/SheffieldKinectGesture.htm
        date:       IJCAI 2013
        sensor:     Kinect v1
        type:
        desc:       "Total of 1080 Kinect videos of six people performing one of 10 hand gesture sequences, such as 'triangle' or 'comehere'. Sequences captured under a variety of illumination and background conditions."
        labelling:  The gesture being performed in each sequence.
        image:      image26.jpg
        ref:        liu-ijcai-2013

    -   name:       RGB-D People Dataset
        linktype:   project_page
        link:       http://www2.informatik.uni-freiburg.de/~spinello/RGBD-dataset.html
        date:       IROS 2011
        sensor:     Kinect v1
        type:
        desc:       3000+ frames of people walking and standing in a university hallway, captured from three Kinects.
        labelling:  "Per-frame bounding box annotations of individual people, together with a 'visibility' measure."
        image:      image22.jpg
        ref:        ["luber-iros-2011", "spinello-iros-2011"]

    -   name:       50 Salads
        linktype:   project_page
        link:       http://cvip.computing.dundee.ac.uk/datasets/foodpreparation/50salads/
        date:       UbiComp 2013
        sensor:     Kinect v1
        type:
        desc:       Over 4 hours of video of 25 people preparing 2 mixed salads each
        labelling:  Accelerometer data from sensors attached to cooking utensils, and labelling of steps in the recipes.
        image:      image23.jpg
        ref:        stein-ubicomp-2013

    -   name:       Microsoft Research Cambridge-12 Kinect gesture data set
        linktype:   project_page
        link:       http://research.microsoft.com/en-us/um/cambridge/projects/msrc12/
        date:       CHI 2012
        sensor:     Kinect v1
        type:
        desc:       594 sequences and 719,359 frames of 30 people performing 12 gestures.
        labelling:  Gesture performed in each video sequence, plus motion tracking of human joint locations.
        image:      image27.jpg
        ref:        fothergill-chi-2012

    -   name:       UR Fall Detection Dataset
        linktype:   project_page
        link:       http://fenix.univ.rzeszow.pl/~mkepski/ds/uf.html
        date:       Computer Vision Theory and Applications 2014
        sensor:     Kinect v1
        type:
        desc:       Videos of people falling over. Consists of 60 sequences recorded with two Kinects.
        labelling:  Accelerometer data from device attached to subject.
        image:      image32.jpg
        ref:        kwolek-cmpb-2014

    -   name:       RGBD-HuDaAct
        linktype:   project_page
        link:       http://adsc.illinois.edu/sites/default/files/files/ADSC-RGBD-dataset-download-instructions.pdf
        date:       ICCV Workshops 2011
        sensor:     Kinect v1
        type:
        desc:       "30 different humans each performing the same 12 activities, e.g. 'eat a meal'. Also includes a random 'background' activity. All performed in a lab environment. Around 5,000,000 frames in total."
        labelling:  Which activity being performed in each sequence.
        image:      image35.jpg
        ref:        ni-iccvw-2011

    # Multi-camera activity recordings with Vicon-derived joint positions.
    -   name:       "Human3.6M"
        linktype:   project_page
        link:       http://vision.imar.ro/human3.6m
        date:       "PAMI 2014"
        sensor:     "SwissRanger time-of-flight (+ 2D cameras)"
        type:       null
        desc:       "11 different humans performing 17 different activities. Data comes from four calibrated video cameras, 1 time-of-flight camera and (static) 3D laser scans of the actors."
        labelling:  "2D and 3D human joint positions, obtained from a Vicon motion capture system."
        image:      image36.jpg
        ref:        ionescu-pami-2014

    # Daily-living and fall videos with acceleration and skeleton labels.
    -   name:       "TST Fall detection dataBase"
        linktype:   project_page
        link:       http://www.tlc.dii.univpm.it/blog/databases4kinect
        date:       "ICT Innovations 2015"
        sensor:     "Kinect v2"
        type:       null
        desc:       "Videos of 11 different humans performing activities of daily living and falling over in various ways."
        labelling:  "Activity performed, acceleration data, skeleton joint locations."
        image:      image42.jpg
        ref:        gasparrini-ictinv-2015

    # Stand-up-and-walk videos with acceleration and skeleton labels.
    -   name:       "TST TUG dataBase"
        linktype:   project_page
        link:       http://www.tlc.dii.univpm.it/blog/databases4kinect
        date:       "IEEE ICC 2015"
        sensor:     "Kinect v2"
        type:       null
        desc:       "Videos of 20 different humans standing up and walking around"
        labelling:  "Acceleration data, skeleton joint locations."
        image:      image43.jpg
        ref:        cippitelli-icc-2015

    # Simulated food-intake videos; no `date`, `type` or `ref` is recorded.
    -   name:       "TST Intake Monitoring dataset"
        linktype:   project_page
        link:       http://www.tlc.dii.univpm.it/blog/databases4kinect
        date:       null
        sensor:     "Kinect v1"
        type:       null
        desc:       "Videos of 35 different humans simulating food intake actions"
        labelling:  "Skeleton joint locations estimated by three different algorithms. Ground truth positions of hands and head joints."
        image:      image51.jpg

    # Human-object interaction videos labelled per video.
    -   name:       "MSR 3D Online Action Dataset"
        linktype:   project_page
        link:       http://research.microsoft.com/en-us/um/people/zliu/actionrecorsrc/
        date:       "ACCV 2014"
        sensor:     "Kinect v1"
        type:       null
        desc:       "Videos of human-object interaction, in seven categories, plus a negative class."
        labelling:  "Activity being performed in each video."
        image:      image45.jpg
        ref:        yu-accv-2014

    # Sign-language gesture clips with pre-segmented hands.
    -   name:       "MSRGesture3D"
        linktype:   project_page
        link:       http://research.microsoft.com/en-us/um/people/zliu/actionrecorsrc/
        date:       "EUSIPCO 2012, ECCV 2012"
        sensor:     "Kinect v1"
        type:       null
        desc:       "10 humans performing 12 American Sign Language gestures, each gesture being performed 2-3 times. The hands have been segmented."
        labelling:  "The gesture being performed in each video."
        image:      image46.jpg
        ref:        kurakin-eusipco-2012

    # Daily-activity clips with activity labels and skeleton joints.
    -   name:       "MSRDailyActivity3D"
        linktype:   project_page
        link:       http://research.microsoft.com/en-us/um/people/zliu/actionrecorsrc/
        date:       "CVPR 2012"
        sensor:     "Kinect v1"
        type:       null
        desc:       "10 humans performing 16 activities, e.g. read book, play guitar. Each activity performed in sitting and standing positions."
        labelling:  "Activity being performed, plus 20 joint locations of skeleton positions."
        image:      image47.jpg
        ref:        wang-cvpr-2012

    # Action clips from an unidentified sensor; `sensor` is quoted because it
    # begins with the `?` indicator character.
    -   name:       "MSR Action3D Dataset"
        linktype:   project_page
        link:       http://research.microsoft.com/en-us/um/people/zliu/actionrecorsrc/
        date:       null
        sensor:     "?? (similar to Kinect, with 320x240 resolution)"
        type:       null
        desc:       "Videos of 10 humans performing 20 action types. Each subject performs each action 2 or 3 times."
        labelling:  "Activity being performed, plus 20 joint locations of skeleton positions."
        image:      image48.jpg
        ref:        li-cvprw-2010

    # Multi-view action recordings captured with three Kinects at once.
    -   name:       Northwestern-UCLA Multiview Action 3D Dataset
        linktype:   project_page
        link:       http://users.eecs.northwestern.edu/~jwa368/my_data.html
        date:
        sensor:     Kinect v1
        type:
        desc:       Three Kinects used to simultaneously record 10 actions each being performed by 10 humans
        labelling:  Activity being performed
        image:      image49.jpg
        ref:        wang-cvpr-2014

    # Controlled-environment actions with wearable accelerometer data.
    -   name:       "UTD Multimodal Human Action Dataset (UTD-MHAD)"
        linktype:   project_page
        link:       http://www.utdallas.edu/~kehtar/UTD-MHAD.html
        date:       "ICIP 2015"
        sensor:     "Kinect v1"
        type:       null
        desc:       "Eight different humans performing 27 actions in a controlled environment, each action repeated 4 times. The humans wore accelerometers."
        labelling:  "Action being performed, accelerometer data associated with each video."
        image:      image52.jpg
        ref:        chen-icip-2015
        qualities:  []

    # Occluded-view activity sequences with MoCap joint markers.
    -   name:       "Dataset of a human performing daily life activities in a scene with occlusions"
        linktype:   project_page
        link:       https://team.inria.fr/larsen/software/datasets/
        date:       "IROS 2015"
        sensor:     "Kinect v1"
        type:       null
        desc:       "12 RGB-D video sequences of a person performing activities with obstacles occluding the view from the Kinect"
        labelling:  "15 position markers of the 3D joint location from a MoCap system"
        image:      image53.jpg
        ref:        dib-iros-2015

    # TV-watching group recordings with gesture labels and MoCap.
    # `date` is quoted so a bare year loads as a string, not an integer.
    -   name:       Background activity dataset
        linktype:   project_page
        # The path uses a plain ASCII tilde; the percent-encoded U+02DC
        # look-alike character does not address the same user directory.
        link:       http://www.dgp.toronto.edu/~dustin/backgroundactivity/
        date:       '2015'
        desc:       "Humans in TV-watching setup, performing occasional gestures. 52 person-hours of video in total, with 13 groups of 4 humans."
        labelling:  Gestures performed. Mocap for all humans.
        sensor:     Kinect v1
        ref:        freeman-arxiv-2015
        image:      image55.jpg

#     -   name:       "Pose Estimation For A Partially Observable Human
# Body From RGB-D Cameras"
#         linktype:   project_page
#         link:       https://team.inria.fr/larsen/software/datasets/
#         date:       IROS 2015,
#         sensor:     '?'
#         desc:       "12 videos, each 50 seconds, of a human performing various actions, where some actions are occluded by objects"
#         labelling:  "3D motion capture on the human in the scene. Many actions performed, but no specific labels given."
#         ref:        dib-iros-2015
#         image:      image56.jpg

    # One-shot-learning gesture data with partial action and body-part labels.
    # `date` is quoted so a bare year loads as a string, not an integer.
    -   name:       "ChaLearn gesture challenge dataset"
        linktype:   project_page
        link:       http://gesture.chalearn.org/data
        date:       '2012'
        sensor:     Kinect v1
        desc:       "Originally designed for one-shot learning, for a Kaggle competition."
        labelling:  "Action being performed in a subset of the videos. Body part annotations"
        ref:        guyon-adiaa-2012
        image:      image57.jpg

    # Italian gesture sequences, one gesture label per sequence.
    -   name:       "Montalbano gesture dataset"
        linktype:   project_page
        link:       http://gesture.chalearn.org/2013-multi-modal-challenge/data-2013-challenge
        date:       "ECCV 2014"
        sensor:     "Kinect v1"
        desc:       "13858 sequences each depicting one of 27 humans performing one of 20 Italian gestures."
        labelling:  "Gesture being performed in each sequence."
        ref:        escalera-eccv-2014
        image:      image58.jpg

    # MoCap-studio action videos with skeleton, stereo and accelerometer data.
    # The year in `date` agrees with the cited reference key (ofli-wacv-2013).
    -   name:       "Berkeley Multimodal Human Action Database"
        linktype:   project_page
        link:       http://tele-immersion.citris-uc.org/berkeley_mhad
        date:       WACV 2013
        sensor:     Kinect v1
        desc:       "660 videos each of one of 12 humans, each performing one of 11 actions in a MoCap studio."
        labelling:  "Action being performed. 3D skeleton positions from MoCap, also stereo cameras and accelerometer data."
        ref:        ofli-wacv-2013
        image:      image59.jpg

    # Egocentric hand-object images, each labelled with a fine-grained grasp.
    -   name:       "Grasp Understanding Dataset (GUN-71)"
        linktype:   project_page
        link:       http://www.gregrogez.net/research/egovision4health/gun-71/
        date:       ICCV 2015
        sensor:     Kinect v2
        # The continuation line of a quoted scalar must be indented deeper than
        # the mapping keys; the line break still folds to a single space.
        desc:       "12,000 images of human hands manipulating one of 28 objects, captured from a chest-mounted RGBD camera.
                    Eight different subjects (4 males and 4 females) in 5 different houses."
        labelling:  "Each image labelled with one of 71 fine-grained grasps."
        ref:
        image:      image67.jpg

    # Manipulation-action videos with action and per-pixel labels.
    -   name:       "Manipulation Action (MANIAC) Dataset"
        linktype:   project_page
        link:       http://www.dpi.physik.uni-goettingen.de/cns/index.php?page=maniac-dataset
        date:       RAS 2014
        sensor:     Kinect v1
        desc:       "Videos of eight manipulation actions, each recorded 15 times (with 5 different humans). Also videos of chained sequences of actions."
        labelling:  "Action being performed, per-pixel frame labelling into objects, hands etc."
        ref:
        image:      image68.jpg

    # Top-down person re-identification videos; no `ref` is recorded.
    -   name:       "TVPR (Top View Person Re-identification) Dataset"
        linktype:   project_page
        link:       http://vrai.dii.univpm.it/re-id-dataset
        date:       "ICPR 2016"
        sensor:     "Asus Xtion PRO Live"
        desc:       "Videos of 100 humans recorded in a top-down configuration."
        labelling:  "Person ID in each video"
        ref:        null
        image:      image73.jpg

    # -   name:       "HumanEva"
    #     linktype:   project_page
    #     link:       http://humaneva.is.tue.mpg.de/
    #     date:       IJCV 2010
    #     sensor:
    #     desc:       "Videos 4 humans performing 6 actions, simultaneously captured from seven calibrated. Videos of 100 humans recorded in a top-down configuration."
    #     labelling:  "Person ID in each video"
    #     ref:
    #     image:      image78.jpg


-   title:  "Datasets involving humans: Head and face"
    desc:
    items:
    # Head-pose images with 3D head position and rotation labels.
    # NOTE(review): `ref` was fanelli-tom-2010, a key matching the venue of the
    # B3D(AC)^2 entry (IEEE Transactions on Multimedia 2010); confirm against
    # the bibliography that fanelli-dadm-2011 is the head-pose paper.
    -   name:       Biwi Kinect Head Pose Database
        linktype:   project_page
        link:       http://www.vision.ee.ethz.ch/~gfanelli/head_pose/head_forest.html#db
        date:       IJCV 2013
        sensor:     Kinect v1
        type:
        desc:       15K images of 20 different people moving their heads in different directions.
        labelling:  "3D position of the head and its rotation, acquired using 'faceshift' software."
        image:      image21.jpg
        ref:        fanelli-dadm-2011

    # Face images with occlusion, expression and landmark annotations.
    -   name:       Eurecom Kinect Face Dataset
        linktype:   project_page
        link:       http://rgb-d.eurecom.fr/
        date:       ACCV Workshop on Computer Vision with Local Binary Pattern Variants 2012
        sensor:     Kinect v1
        type:
        desc:       Images of faces captured under laboratory conditions, with different levels of occlusion and illumination, and with different facial expressions.
        labelling:  In addition to occlusion and expression type, each image is manually labelled with the position of six facial landmarks.
        image:      image25.jpg
        ref:        min-smc-2014

    # Real and mask-spoofed face frames with identity and eye-position labels.
    -   name:       "3D Mask Attack Dataset"
        linktype:   project_page
        link:       https://www.idiap.ch/dataset/3dmad
        date:       "Biometrics: Theory, Applications and Systems 2013"
        sensor:     "Kinect v1"
        type:       null
        desc:       "76500 frames of 17 different people, facing the camera against a plain background. Two sets of the data are captured on the real subjects two weeks apart, while the final set consists of a single person wearing a fake face mask of the 17 different people."
        labelling:  "Which user is in each frame. Which images are real and which are spoofed. Manually labelled eye positions."
        image:      image33.jpg
        ref:        erdogmus-btas-2013

    # Audio-visual recordings of spoken sentences with emotion labels.
    # NOTE(review): `ref` was fanelli-dadm-2011; fanelli-tom-2010 is the key
    # whose venue and year match `date` below. Confirm against the bibliography.
    -   name:       Biwi 3D Audiovisual Corpus of Affective Communication - B3D(AC)^2
        linktype:   project_page
        link:       http://www.vision.ee.ethz.ch/datasets/b3dac2.en.html
        date:       IEEE Transactions on Multimedia 2010
        sensor:     Custom active stereo setup
        type:
        desc:       "Simultaneous audio and visual recordings of 1109 sentences spoken by 14 different people. Each sentence spoken neutrally and with an emotion. Depth images converted to 3D mesh."
        labelling:  "Perceived emotions for each recording. Audio labelled with phonemes."
        image:      image39.jpg
        ref:        fanelli-tom-2010

    # Range images of turning heads, labelled at the nose.
    -   name:       ETH Face Pose Range Image Data Set
        linktype:   project_page
        link:       http://www.vision.ee.ethz.ch/datasets/headposeCVPR08/
        date:       CVPR 2008
        sensor:     Custom active stereo setup
        type:
        desc:       10,545 images of 20 different people turning their head.
        labelling:  Nose position and coordinate frame at the nose.
        image:      image40.jpg
        ref:        breitenstein-cvpr-2008

    # -   name:       A Full-Body Gesture Database for Automatic Gesture Recognition
    #     linktype:   email_in_paper
    #     link:       http://dl.acm.org/citation.cfm?id=1126345
    #     date:       7th International Conference on Automatic Face and Gesture Recognition 2006
    #     sensor:     Eagle digital camera
    #     type:
    #     desc:       3D motion data from a MoCap setup of humans performing various motions.
    #     labelling:
    #     image:

    -   name:
        linktype:   project_page
        link:
        date:
        sensor:
        type:
        desc:
        labelling:
        image:
