ai-forever
diff --git a/‎DPF/dataset_reader.py‎
Lines changed: 30 additions & 1 deletion b/‎DPF/dataset_reader.py‎
Lines changed: 30 additions & 1 deletion
diff --git a/‎DPF/filters/data_filter.py‎
Lines changed: 3 additions & 2 deletions b/‎DPF/filters/data_filter.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎DPF/filters/images/aesthetic_filter.py‎
Lines changed: 2 additions & 1 deletion b/‎DPF/filters/images/aesthetic_filter.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎DPF/filters/images/aesthetic_improved_filter.py‎
Lines changed: 2 additions & 1 deletion b/‎DPF/filters/images/aesthetic_improved_filter.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎DPF/filters/images/blip_captioning_filter.py‎
Lines changed: 9 additions & 2 deletions b/‎DPF/filters/images/blip_captioning_filter.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎DPF/filters/images/cliplabels_filter.py‎
Lines changed: 2 additions & 8 deletions b/‎DPF/filters/images/cliplabels_filter.py‎
Lines changed: 2 additions & 8 deletions
diff --git a/‎DPF/filters/images/hash_filters.py‎
Lines changed: 8 additions & 39 deletions b/‎DPF/filters/images/hash_filters.py‎
Lines changed: 8 additions & 39 deletions
diff --git a/‎DPF/filters/images/info_filter.py‎
Lines changed: 2 additions & 3 deletions b/‎DPF/filters/images/info_filter.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎DPF/filters/images/llava_captioning_filter.py‎
Lines changed: 4 additions & 3 deletions b/‎DPF/filters/images/llava_captioning_filter.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎DPF/filters/images/nsfw_filter.py‎
Lines changed: 2 additions & 1 deletion b/‎DPF/filters/images/nsfw_filter.py‎
Lines changed: 2 additions & 1 deletion
@@ -321,7 +321,7 @@ def from_config(
  config: DatasetConfig,
  **kwargs
  ) -> DatasetProcessor:
- """Creates DatasetConfig dataset
+ """Creates DatasetProcessor from config
 
  Parameters
  ----------
@@ -345,3 +345,32 @@ def from_config(
  raise ValueError(f"Unsupported config: {config}")
  return processor
 
+ def from_df(self, config: DatasetConfig, df: pd.DataFrame) -> DatasetProcessor:
+ """Creates a DatasetProcessor from a config and an existing dataframe
+
+ Parameters
+ ----------
+ config: DatasetConfig
+ ShardsDatasetConfig, ShardedFilesDatasetConfig or FilesDatasetConfig; other types raise ValueError
+ df: pd.DataFrame
+ Dataframe for DatasetProcessor.df
+
+ Returns
+ -------
+ DatasetProcessor
+ Processor of the class matching the config type, built with self.filesystem, config and df
+ """
+ if isinstance(config, ShardsDatasetConfig):
+ processor_class = ShardsDatasetProcessor
+ elif isinstance(config, ShardedFilesDatasetConfig):
+ processor_class = ShardedFilesDatasetProcessor
+ elif isinstance(config, FilesDatasetConfig):
+ processor_class = FilesDatasetProcessor
+ else:
+ raise ValueError(f"Unsupported config: {config}")
+
+ return processor_class(
+ filesystem=self.filesystem,
+ config=config,
+ df=df
+ )
@@ -12,9 +12,10 @@ class DataFilter(ABC):
  Abstract class for all filters that use datalaaders.
  """
 
- def __init__(self, pbar: bool):
+ def __init__(self, pbar: bool, _pbar_position: int = 0):
  super().__init__()
  self.pbar = pbar
+ self.pbar_position = _pbar_position
 
  @property
  @abstractmethod
@@ -66,7 +67,7 @@ def run(self, dataset: Dataset) -> pd.DataFrame:
  dataloader = DataLoader(dataset, collate_fn=identical_collate_fn, **self.dataloader_kwargs)
  df_labels = self._generate_dict_from_schema()
 
- for batch in tqdm(dataloader, disable=not self.pbar):
+ for batch in tqdm(dataloader, disable=not self.pbar, position=self.pbar_position):
  # drop Nans
  batch_filtered = [b[1] for b in batch if b[0]]
  if len(batch_filtered) == 0:
 
@@ -60,8 +60,9 @@ def __init__(
  workers: int = 16,
  batch_size: int = 64,
  pbar: bool = True,
+ _pbar_position: int = 0
  ):
- super().__init__(pbar)
+ super().__init__(pbar, _pbar_position)
 
  self.num_workers = workers
  self.batch_size = batch_size
 
@@ -79,8 +79,9 @@ def __init__(
  workers: int = 16,
  batch_size: int = 64,
  pbar: bool = True,
+ _pbar_position: int = 0
  ):
- super().__init__(pbar)
+ super().__init__(pbar, _pbar_position)
 
  self.num_workers = workers
  self.batch_size = batch_size
 
@@ -16,8 +16,15 @@ class BLIPCaptioningFilter(ImageFilter):
  BLIPCaptioningFilter class
  """
 
- def __init__(self, workers=16, batch_size=64, device="cuda:0", pbar=True):
- super().__init__(pbar)
+ def __init__(
+ self,
+ workers: int = 16,
+ batch_size: int = 64,
+ device: str = "cuda:0",
+ pbar: bool = True,
+ _pbar_position: int = 0
+ ):
+ super().__init__(pbar, _pbar_position)
 
  self.num_workers = workers
  self.batch_size = batch_size
 
@@ -34,13 +34,6 @@ class CLIPLabelsFilter(ImageFilter):
  Batch size for model
  pbar: bool = True
  Flag for displaying progress bar
-
- Attributes
- ----------
- schema: List[str]
- List of columns to be added with this filter.
- dataloader_kwargs: dict:
- Parameters for dataloader (batch_size, num_workers, collate_fn, etc.)
  """
 
  def __init__(
@@ -53,8 +46,9 @@ def __init__(
  workers: int = 16,
  batch_size: int = 64,
  pbar: bool = True,
+ _pbar_position: int = 0
  ):
- super().__init__(pbar)
+ super().__init__(pbar, _pbar_position)
 
  if templates is None:
  templates = ["{}", "photo of a {}"]
 
@@ -35,9 +35,14 @@ class PHashFilter(ImageFilter):
  PHashFilter class
  """
 
- def __init__(self, sim_hash_size: int = 8, workers: int = 16, pbar: bool = True):
- super().__init__(pbar)
-
+ def __init__(
+ self,
+ sim_hash_size: int = 8,
+ workers: int = 16,
+ pbar: bool = True,
+ _pbar_position: int = 0
+ ):
+ super().__init__(pbar, _pbar_position)
  self.num_workers = workers
  self.sim_hash_size = sim_hash_size
 
@@ -68,39 +73,3 @@ def process_batch(self, batch) -> dict:
  df_batch_labels[f"image_phash_{self.sim_hash_size}"].extend(img_simhashes)
 
  return df_batch_labels
-
-
-class MD5Filter(ImageFilter):
- """
- MD5Filter class
- """
-
- def __init__(
- self,
- pbar: bool = True,
- workers: int = 16,
- ):
- super().__init__(pbar)
-
- self.num_workers = workers
-
- self.schema = ["image_path", "image_md5"]
- self.dataloader_kwargs = {
- "num_workers": self.num_workers,
- "batch_size": 1,
- "drop_last": False,
- }
-
- def preprocess(self, img_bytes: bytes, data: dict):
- image_path = data["image_path"]
- img_md5 = get_md5_hash(img_bytes)
- return image_path, img_md5
-
- def process_batch(self, batch) -> dict:
- df_batch_labels = self._generate_dict_from_schema()
-
- image_paths, img_md5s = list(zip(*batch))
- df_batch_labels["image_path"].extend(image_paths)
- df_batch_labels["image_md5"].extend(img_md5s)
-
- return df_batch_labels
@@ -51,9 +51,8 @@ class ImageInfoFilter(ImageFilter):
  ImageInfoFilter class
  """
 
- def __init__(self, workers: int = 16, pbar: bool = True):
- super().__init__(pbar)
-
+ def __init__(self, workers: int = 16, pbar: bool = True, _pbar_position: int = 0):
+ super().__init__(pbar, _pbar_position)
  self.num_workers = workers
 
  @property
 
@@ -28,10 +28,11 @@ def __init__(
  prompt: str = 'detailed-long', 
  workers: int = 16,
  batch_size: int = 16,
- device="cuda:0", 
- pbar=True
+ device: str = "cuda:0",
+ pbar: bool = True,
+ _pbar_position: int = 0
  ):
- super().__init__(pbar)
+ super().__init__(pbar, _pbar_position)
  self.batch_size = batch_size
  self.num_workers = workers
  self.device = device
 
@@ -74,8 +74,9 @@ def __init__(
  batch_size: int = 64,
  device: str = "cuda:0",
  pbar: bool = True,
+ _pbar_position: int = 0
  ):
- super().__init__(pbar)
+ super().__init__(pbar, _pbar_position)
 
  self.num_workers = workers
  self.batch_size = batch_size