PaddlePaddle
diff --git a/‎python/paddle/v2/dataset/flowers.py‎
Lines changed: 184 additions & 0 deletions b/‎python/paddle/v2/dataset/flowers.py‎
Lines changed: 184 additions & 0 deletions
diff --git a/‎python/paddle/v2/dataset/tests/flowers_test.py‎
Lines changed: 51 additions & 0 deletions b/‎python/paddle/v2/dataset/tests/flowers_test.py‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎python/paddle/v2/image.py‎
Lines changed: 92 additions & 6 deletions b/‎python/paddle/v2/image.py‎
Lines changed: 92 additions & 6 deletions
@@ -0,0 +1,184 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html 
+and parse train/test/valid sets into paddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories. 
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+number of classes. Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+
+"""
+import cPickle
+import itertools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.v2.image import *
+import os
+import numpy as np
+import paddle.v2 as paddle
+from multiprocessing import cpu_count
+__all__ = ['train', 'test', 'valid']
+
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+
+
+def default_mapper(sample):
+ '''
+ map image bytes data to type needed by model input layer
+ '''
+ img, label = sample
+ img = paddle.image.load_image_bytes(img)
+ img = paddle.image.simple_transform(img, 256, 224, True)
+ return img.flatten().astype('float32'), label
+
+
+def reader_creator(data_file,
+ label_file,
+ setid_file,
+ dataset_name,
+ mapper=default_mapper,
+ buffered_size=1024):
+ '''
+ 1. read images from tar file and 
+ merge images into batch files in 102flowers.tgz_batch/
+ 2. get a reader to read sample from batch file
+ 
+ :param data_file: downloaded data file 
+ :type data_file: string
+ :param label_file: downloaded label file 
+ :type label_file: string
+ :param setid_file: downloaded setid file containing information
+ about how to split dataset
+ :type setid_file: string
+ :param dataset_name: data set name (tstid|trnid|valid)
+ :type dataset_name: string
+ :param mapper: a function to map image bytes data to type 
+ needed by model input layer
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: data reader
+ :rtype: callable
+ '''
+ labels = scio.loadmat(label_file)['labels'][0]
+ indexes = scio.loadmat(setid_file)[dataset_name][0]
+ img2label = {}
+ for i in indexes:
+ img = "jpg/image_%05d.jpg" % i
+ img2label[img] = labels[i - 1]
+ file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+
+ def reader():
+ for file in open(file_list):
+ file = file.strip()
+ batch = None
+ with open(file, 'r') as f:
+ batch = cPickle.load(f)
+ data = batch['data']
+ labels = batch['label']
+ for sample, label in itertools.izip(data, batch['label']):
+ yield sample, int(label)
+
+ return paddle.reader.xmap_readers(mapper, reader,
+ cpu_count(), buffered_size)
+
+
+def train(mapper=default_mapper, buffered_size=1024):
+ '''
+ Create flowers training set reader. 
+ It returns a reader, each sample in the reader is 
+ image pixels in [0, 1] and label in [1, 102] 
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: train data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
+ buffered_size)
+
+
+def test(mapper=default_mapper, buffered_size=1024):
+ '''
+ Create flowers test set reader. 
+ It returns a reader, each sample in the reader is 
+ image pixels in [0, 1] and label in [1, 102] 
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: test data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
+ buffered_size)
+
+
+def valid(mapper=default_mapper, buffered_size=1024):
+ '''
+ Create flowers validation set reader. 
+ It returns a reader, each sample in the reader is 
+ image pixels in [0, 1] and label in [1, 102] 
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: test data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
+ buffered_size)
+
+
+def fetch():
+ download(DATA_URL, 'flowers', DATA_MD5)
+ download(LABEL_URL, 'flowers', LABEL_MD5)
+ download(SETID_URL, 'flowers', SETID_MD5)
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.flowers
+import unittest
+
+
+class TestFlowers(unittest.TestCase):
+ def check_reader(self, reader):
+ sum = 0
+ label = 0
+ size = 224 * 224 * 3
+ for l in reader():
+ self.assertEqual(l[0].size, size)
+ if l[1] > label:
+ label = l[1]
+ sum += 1
+ return sum, label
+
+ def test_train(self):
+ instances, max_label_value = self.check_reader(
+ paddle.v2.dataset.flowers.train())
+ self.assertEqual(instances, 1020)
+ self.assertEqual(max_label_value, 102)
+
+ def test_test(self):
+ instances, max_label_value = self.check_reader(
+ paddle.v2.dataset.flowers.test())
+ self.assertEqual(instances, 6149)
+ self.assertEqual(max_label_value, 102)
+
+ def test_valid(self):
+ instances, max_label_value = self.check_reader(
+ paddle.v2.dataset.flowers.valid())
+ self.assertEqual(instances, 1020)
+ self.assertEqual(max_label_value, 102)
+
+
+if __name__ == '__main__':
+ unittest.main()
@@ -1,14 +1,16 @@
 import numpy as np
 try:
  import cv2
-except:
- print(
- "import cv2 error, please install opencv-python: pip install opencv-python"
- )
+except ImportError:
+ cv2 = None
+import os
+import tarfile
+import cPickle
 
 __all__ = [
- "load_image", "resize_short", "to_chw", "center_crop", "random_crop",
- "left_right_flip", "simple_transform", "load_and_transform"
+ "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+ "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+ "batch_images_from_tar"
 ]
 """
 This file contains some common interfaces for image preprocess.
@@ -28,6 +30,90 @@
 """
 
 
+def batch_images_from_tar(data_file,
+ dataset_name,
+ img2label,
+ num_per_batch=1024):
+ """
+ Read images from tar file and batch them into batch file.
+ param data_file: path of image tar file
+ type data_file: string
+ param dataset_name: 'train','test' or 'valid'
+ type dataset_name: string
+ param img2label: a dic with image file name as key 
+ and image's label as value
+ type img2label: dic
+ param num_per_batch: image number per batch file
+ type num_per_batch: int
+ return: path of list file containing paths of batch file
+ rtype: string
+ """
+ batch_dir = data_file + "_batch"
+ out_path = "%s/%s" % (batch_dir, dataset_name)
+ meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+ if os.path.exists(out_path):
+ return meta_file
+ else:
+ os.makedirs(out_path)
+
+ tf = tarfile.open(data_file)
+ mems = tf.getmembers()
+ data = []
+ labels = []
+ file_id = 0
+ for mem in mems:
+ if mem.name in img2label:
+ data.append(tf.extractfile(mem).read())
+ labels.append(img2label[mem.name])
+ if len(data) == num_per_batch:
+ output = {}
+ output['label'] = labels
+ output['data'] = data
+ cPickle.dump(
+ output,
+ open('%s/batch_%d' % (out_path, file_id), 'w'),
+ protocol=cPickle.HIGHEST_PROTOCOL)
+ file_id += 1
+ data = []
+ labels = []
+ if len(data) > 0:
+ output = {}
+ output['label'] = labels
+ output['data'] = data
+ cPickle.dump(
+ output,
+ open('%s/batch_%d' % (out_path, file_id), 'w'),
+ protocol=cPickle.HIGHEST_PROTOCOL)
+
+ with open(meta_file, 'a') as meta:
+ for file in os.listdir(out_path):
+ meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+ return meta_file
+
+
+def load_image_bytes(bytes, is_color=True):
+ """
+ Load an color or gray image from bytes array.
+
+ Example usage:
+ 
+ .. code-block:: python
+ with open('cat.jpg') as f:
+ im = load_image_bytes(f.read())
+
+ :param bytes: the input image bytes array.
+ :type file: str
+ :param is_color: If set is_color True, it will load and
+ return a color image. Otherwise, it will
+ load and return a gray image.
+ """
+ flag = 1 if is_color else 0
+ file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+ img = cv2.imdecode(file_bytes, flag)
+ return img
+
+
 def load_image(file, is_color=True):
  """
  Load an color or gray image from the file path.