1818import os
1919import xml .etree .ElementTree
2020
21+ from etils .epath import Path
2122from tensorflow_datasets .core .utils .lazy_imports_utils import tensorflow as tf
2223import tensorflow_datasets .public_api as tfds
2324
6465]
6566_SPECIES_CLASSES = ["Cat" , "Dog" ]
6667
67- # List of samples with corrupt image files
68- _SKIP_SAMPLES = [
68+ # Samples whose image files are corrupt (mostly a wrong file format); they are re-encoded as JPEG during dataset creation.
69+ _CORRUPT_SAMPLES = [
6970 "beagle_116" ,
7071 "chihuahua_121" ,
7172 "Abyssinian_5" ,
8081 "Egyptian_Mau_191"
8182]
8283
84+ _EMPTY_BBOX = tfds .features .BBox (0. , 0. , 0. , 0. )
85+
86+
8387def _get_head_bbox (annon_filepath ):
8488 """Read head bbox from annotation XML file."""
85- with tf . io . gfile . GFile (annon_filepath , "r" ) as f :
89+ with Path (annon_filepath ). open ( "r" ) as f :
8690 root = xml .etree .ElementTree .parse (f ).getroot ()
8791
88- # Disable pytype to avoid attribute-error due to find returning
89- # Optional[Element]
90- # pytype: disable=attribute-error
91- size = root .find ("size" )
92+ size = root .find ("size" ) # pytype: disable=annotation-type-mismatch
9293 width = float (size .find ("width" ).text )
9394 height = float (size .find ("height" ).text )
9495
@@ -106,7 +107,10 @@ def _get_head_bbox(annon_filepath):
106107class Builder (tfds .core .GeneratorBasedBuilder ):
107108 """Oxford-IIIT pet dataset."""
108109
109- VERSION = tfds .core .Version ("3.2.0" )
110+ VERSION = tfds .core .Version ("4.0.0" )
111+ RELEASE_NOTES = {
112+ '4.0.0' : 'Add head bounding boxes. Fix corrupt images. Update dataset URL.'
113+ }
110114
111115 def _info (self ):
112116 return self .dataset_info_from_configs (
@@ -118,7 +122,7 @@ def _info(self):
118122 "segmentation_mask" : tfds .features .Image (
119123 shape = (None , None , 1 ), use_colormap = True
120124 ),
121- "head " : tfds .features .BBoxFeature ()
125+ "head_bbox " : tfds .features .BBoxFeature ()
122126 }),
123127 supervised_keys = ("image" , "label" ),
124128 homepage = "http://www.robots.ox.ac.uk/~vgg/data/pets/" ,
@@ -162,13 +166,23 @@ def _split_generators(self, dl_manager):
162166 def _generate_examples (
163167 self , images_dir_path , annotations_dir_path , images_list_file
164168 ):
165- with tf . io . gfile . GFile (images_list_file , "r" ) as images_list :
169+ with Path (images_list_file ). open ( "r" ) as images_list :
166170 for line in images_list :
167171 image_name , label , species , _ = line .strip ().split (" " )
168172
169- # skip corrupt samples
170- if image_name in _SKIP_SAMPLES :
171- continue
173+ image_path = os .path .join (images_dir_path , image_name + ".jpg" )
174+
175+ if image_name in _CORRUPT_SAMPLES :
176+ # Some images trigger 'Corrupt JPEG data...' messages during training or any other iteration over the dataset.
177+ # Re-encoding them once fixes the issue (discussion: https://github.com/tensorflow/datasets/issues/2188).
178+ with Path (image_path ).open ("rb" ) as image_file :
179+ img_data = image_file .read ()
180+ img_tensor = tf .image .decode_image (img_data )
181+ if tf .shape (img_tensor )[- 1 ] == 4 : # some files have an alpha channel -> remove
182+ img_tensor = img_tensor [:, :, :- 1 ]
183+ img_recoded = tf .io .encode_jpeg (img_tensor )
184+ with Path (image_path ).open ("wb" ) as image_file :
185+ image_file .write (img_recoded .numpy ())
172186
173187 trimaps_dir_path = os .path .join (annotations_dir_path , "trimaps" )
174188 xmls_dir_path = os .path .join (annotations_dir_path , "xmls" )
@@ -181,16 +195,16 @@ def _generate_examples(
181195
182196 try :
183197 head_bbox = _get_head_bbox (os .path .join (xmls_dir_path , xml_name ))
184- except tf . errors . NotFoundError :
198+ except FileNotFoundError as e :
185199 # test samples do not have an annotation file
186- head_bbox = tfds . features . BBox ( 0. , 0. , 0. , 0. )
200+ head_bbox = _EMPTY_BBOX
187201
188202 record = {
189203 "image" : os .path .join (images_dir_path , image_name ),
190204 "label" : int (label ),
191205 "species" : species ,
192206 "file_name" : image_name ,
193207 "segmentation_mask" : os .path .join (trimaps_dir_path , trimap_name ),
194- "head " : head_bbox
208+ "head_bbox " : head_bbox
195209 }
196210 yield image_name , record
0 commit comments