Skip to content

Commit 30a6f23

Browse files
committed
Update prepare_data.py
Memory leak fix (#28). Note: someone should probably implement a better progress indicator.
1 parent 5fa9a07 commit 30a6f23

File tree

1 file changed

+79
-29
lines changed

1 file changed

+79
-29
lines changed

data/prepare_data.py

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
import argparse
22
from io import BytesIO
33
import multiprocessing
4+
from multiprocessing import Lock, Process, RawValue
45
from functools import partial
6+
from multiprocessing.sharedctypes import RawValue
57
from PIL import Image
68
from tqdm import tqdm
79
from torchvision.transforms import functional as trans_fn
810
import os
911
from pathlib import Path
1012
import lmdb
13+
import numpy as np
14+
import time
1115

1216

1317
def resize_and_convert(img, size, resample):
@@ -35,7 +39,6 @@ def resize_multiple(img, sizes=(16, 128), resample=Image.BICUBIC, lmdb_save=Fals
3539

3640
return [lr_img, hr_img, sr_img]
3741

38-
3942
def resize_worker(img_file, sizes, resample, lmdb_save=False):
4043
img = Image.open(img_file)
4144
img = img.convert('RGB')
@@ -44,6 +47,55 @@ def resize_worker(img_file, sizes, resample, lmdb_save=False):
4447

4548
return img_file.name.split('.')[0], out
4649

50+
class WorkingContext():
    """Process-shared bookkeeping handed to every resize worker.

    Bundles the resize callable and output settings together with a
    cross-process progress counter. RawValue carries no synchronisation
    of its own, so every access goes through an explicit Lock.
    """

    def __init__(self, resize_fn, lmdb_save, out_path, env, sizes):
        # Worker configuration, read directly by the worker processes.
        self.resize_fn = resize_fn
        self.lmdb_save = lmdb_save
        self.out_path = out_path
        self.env = env
        self.sizes = sizes
        # Shared, lock-guarded counter of images processed so far.
        self.counter = RawValue('i', 0)
        self.counter_lock = Lock()

    def inc_get(self):
        """Atomically increment the shared counter and return the new total."""
        self.counter_lock.acquire()
        try:
            self.counter.value += 1
            return self.counter.value
        finally:
            self.counter_lock.release()

    def value(self):
        """Return the current counter value, read under the lock."""
        self.counter_lock.acquire()
        try:
            return self.counter.value
        finally:
            self.counter_lock.release()
69+
70+
def prepare_process_worker(wctx, file_subset):
    """Resize every image in *file_subset* and persist the results.

    Runs inside a worker process. For each file, ``wctx.resize_fn``
    yields ``(name, [lr, hr, sr])``; the triplet is written either as
    PNG files under ``wctx.out_path`` or into the shared LMDB
    environment, and the shared progress counter is advanced.

    Args:
        wctx: WorkingContext carrying resize_fn, lmdb_save, out_path,
            env, sizes and the shared counter.
        file_subset: iterable of image paths assigned to this worker.
    """
    for file in file_subset:
        i, imgs = wctx.resize_fn(file)
        lr_img, hr_img, sr_img = imgs
        if not wctx.lmdb_save:
            lr_img.save(
                '{}/lr_{}/{}.png'.format(wctx.out_path, wctx.sizes[0], i.zfill(5)))
            hr_img.save(
                '{}/hr_{}/{}.png'.format(wctx.out_path, wctx.sizes[1], i.zfill(5)))
            sr_img.save(
                '{}/sr_{}_{}/{}.png'.format(wctx.out_path, wctx.sizes[0], wctx.sizes[1], i.zfill(5)))
            wctx.inc_get()
        else:
            # Single write transaction per image: LMDB serializes writers
            # across processes, so opening a second transaction just to
            # refresh 'length' (as the original did) doubles the lock and
            # commit overhead. Folding the length update into the same
            # commit also keeps 'length' consistent with the stored data.
            curr_total = wctx.inc_get()
            with wctx.env.begin(write=True) as txn:
                txn.put('lr_{}_{}'.format(
                    wctx.sizes[0], i.zfill(5)).encode('utf-8'), lr_img)
                txn.put('hr_{}_{}'.format(
                    wctx.sizes[1], i.zfill(5)).encode('utf-8'), hr_img)
                txn.put('sr_{}_{}_{}'.format(
                    wctx.sizes[0], wctx.sizes[1], i.zfill(5)).encode('utf-8'), sr_img)
                txn.put('length'.encode('utf-8'), str(curr_total).encode('utf-8'))
93+
94+
def all_threads_inactive(worker_threads):
    """Return True once every worker has finished.

    Despite the name, the callers pass multiprocessing.Process objects;
    ``is_alive()`` is False for a process that has terminated (or was
    never started), so an empty list counts as "all inactive".
    """
    # Idiomatic replacement for the hand-rolled early-return loop.
    return not any(worker.is_alive() for worker in worker_threads)
4799

48100
def prepare(img_path, out_path, n_worker, sizes=(16, 128), resample=Image.BICUBIC, lmdb_save=False):
49101
resize_fn = partial(resize_worker, sizes=sizes,
@@ -60,31 +112,29 @@ def prepare(img_path, out_path, n_worker, sizes=(16, 128), resample=Image.BICUBI
60112
else:
61113
env = lmdb.open(out_path, map_size=1024 ** 4, readahead=False)
62114

63-
total = 0
64-
if n_worker>1:
65-
with multiprocessing.Pool(n_worker) as pool:
66-
for i, imgs in tqdm(pool.imap_unordered(resize_fn, files)):
67-
lr_img, hr_img, sr_img = imgs
68-
if not lmdb_save:
69-
lr_img.save(
70-
'{}/lr_{}/{}.png'.format(out_path, sizes[0], i.zfill(5)))
71-
hr_img.save(
72-
'{}/hr_{}/{}.png'.format(out_path, sizes[1], i.zfill(5)))
73-
sr_img.save(
74-
'{}/sr_{}_{}/{}.png'.format(out_path, sizes[0], sizes[1], i.zfill(5)))
75-
else:
76-
with env.begin(write=True) as txn:
77-
txn.put('lr_{}_{}'.format(
78-
sizes[0], i.zfill(5)).encode('utf-8'), lr_img)
79-
txn.put('hr_{}_{}'.format(
80-
sizes[1], i.zfill(5)).encode('utf-8'), hr_img)
81-
txn.put('sr_{}_{}_{}'.format(
82-
sizes[0], sizes[1], i.zfill(5)).encode('utf-8'), sr_img)
83-
total += 1
84-
if lmdb_save:
85-
with env.begin(write=True) as txn:
86-
txn.put('length'.encode('utf-8'), str(total).encode('utf-8'))
115+
if n_worker > 1:
116+
# prepare data subsets
117+
multi_env = None
118+
if lmdb_save:
119+
multi_env = env
120+
121+
file_subsets = np.array_split(files, n_worker)
122+
worker_threads = []
123+
wctx = WorkingContext(resize_fn, lmdb_save, out_path, multi_env, sizes)
124+
125+
# start worker processes, monitor results
126+
for i in range(n_worker):
127+
proc = Process(target=prepare_process_worker, args=(wctx, file_subsets[i]))
128+
proc.start()
129+
worker_threads.append(proc)
130+
131+
total_count = str(len(files))
132+
while not all_threads_inactive(worker_threads):
133+
print("{}/{} images processed".format(wctx.value(), total_count))
134+
time.sleep(0.1)
135+
87136
else:
137+
total = 0
88138
for file in tqdm(files):
89139
i, imgs = resize_fn(file)
90140
lr_img, hr_img, sr_img = imgs
@@ -111,12 +161,12 @@ def prepare(img_path, out_path, n_worker, sizes=(16, 128), resample=Image.BICUBI
111161
if __name__ == '__main__':
112162
parser = argparse.ArgumentParser()
113163
parser.add_argument('--path', '-p', type=str,
114-
default='{}/Dataset/celebahq_256'.format(Path.home()))
164+
default='../dataset/bunchofinputimgsfolder')
115165
parser.add_argument('--out', '-o', type=str,
116-
default='./dataset/celebahq')
166+
default='../dataset/celebahq')
117167

118-
parser.add_argument('--size', type=str, default='16,128')
119-
parser.add_argument('--n_worker', type=int, default=1)
168+
parser.add_argument('--size', type=str, default='64,512')
169+
parser.add_argument('--n_worker', type=int, default=3)
120170
parser.add_argument('--resample', type=str, default='bicubic')
121171
# default save in png format
122172
parser.add_argument('--lmdb', '-l', action='store_true')

0 commit comments

Comments
 (0)