Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
252 commits
Select commit Hold shift + click to select a range
453ac87
Update validator
boomb0om Oct 8, 2023
3b9080b
Add writers
boomb0om Oct 8, 2023
853fe23
Refactor filters
boomb0om Oct 9, 2023
f261540
Add dataframe filter in processor
boomb0om Oct 10, 2023
0793ce7
Refactor & rework text filters
boomb0om Oct 12, 2023
d3296f6
Change threads to processes
boomb0om Oct 13, 2023
4853c11
Small updates
boomb0om Oct 17, 2023
3e7b802
Merge branch 'dev' of https://github.com/ai-forever/DataProcessingFra…
boomb0om Oct 17, 2023
6471dba
Add random sample reading
boomb0om Oct 20, 2023
c74b480
Add files format
boomb0om Oct 20, 2023
b05e47c
Add llava captioning filter
boomb0om Oct 23, 2023
aae2d56
Upgrade llava filter
AleksandrTulenkov Oct 30, 2023
feb697a
add threads in processor methods
boomb0om Nov 3, 2023
846d4ab
Add batching in llava filter
boomb0om Nov 4, 2023
877bbfd
Add workers param in DatasetProcessor methods signature
boomb0om Nov 7, 2023
bfdd6a9
Add df validation during reading
boomb0om Nov 7, 2023
794ab7b
Upgrade processor
boomb0om Nov 7, 2023
c0ded96
fix df validation during reading
boomb0om Nov 7, 2023
65e5ba7
Refactor validators
boomb0om Nov 7, 2023
6d55346
fix types
Vitaly-Protasov Nov 28, 2023
b60118f
Fix typehinting
boomb0om Nov 28, 2023
849bd30
Merge pull request #21 from ai-forever/lysenko_type
boomb0om Nov 28, 2023
51119a0
Add filenaming in writers
boomb0om Nov 28, 2023
24f5970
Refactor configs
boomb0om Nov 28, 2023
1e735ac
Refactor configs & update tests
boomb0om Nov 28, 2023
7537670
Add docstrings
boomb0om Nov 28, 2023
66a51e9
Delete pylint workflow & add pytest workflow
boomb0om Nov 28, 2023
f5b6b5a
Fix pytest workflow
boomb0om Nov 28, 2023
4a2df49
Update requirements.txt
boomb0om Nov 28, 2023
6dec6e7
Update requirements.txt
boomb0om Nov 28, 2023
82d21f4
Update requirements.txt
boomb0om Nov 28, 2023
0a5205d
Remove old examples
boomb0om Nov 29, 2023
bf3e6e4
Add docstrings & update workflow
boomb0om Nov 29, 2023
ab3c268
Update README.md
boomb0om Nov 29, 2023
8ed30e9
Add 'files' format
boomb0om Dec 5, 2023
b51ed4c
Add new tests
boomb0om Dec 5, 2023
b21e46e
Refactor filters
boomb0om Dec 5, 2023
53c965d
Add video filters
boomb0om Dec 5, 2023
4cfca4a
Refactor collate_fn in filters
boomb0om Dec 5, 2023
822745b
Add image filter adapter for videos
boomb0om Dec 5, 2023
ed4c162
Fix bug
boomb0om Dec 5, 2023
25344c7
Update requirements
boomb0om Dec 5, 2023
9ee0015
Add more docstrings
boomb0om Dec 6, 2023
b0620bc
Update readme
boomb0om Dec 6, 2023
c925051
Add script for multigpu filtering
boomb0om Dec 8, 2023
30e51ae
Add summary
boomb0om Dec 8, 2023
a196270
Update multi gpu filtering
boomb0om Dec 13, 2023
b05c42f
Merge conflicts
boomb0om Dec 13, 2023
cff7ff5
Add new aesthetic filter
Vitaly-Protasov Dec 14, 2023
2659956
drop Lightning Module
Vitaly-Protasov Dec 15, 2023
ccc6c46
Merge pull request #22 from ai-forever/lysenko_aesthetic
boomb0om Dec 15, 2023
9182c60
Update llava filter & multigpu script
boomb0om Dec 15, 2023
08cb1c5
Merge branch 'dev' of https://github.com/ai-forever/DataProcessingFra…
boomb0om Dec 15, 2023
19e97ac
Fix small bugs & refactor
boomb0om Dec 15, 2023
8db2ccb
Add tests for writers
boomb0om Dec 18, 2023
ae34792
Fix df error in multigpu filtering
boomb0om Dec 18, 2023
ab5dde4
add blip filter
Vitaly-Protasov Dec 18, 2023
3b90a82
Fix blip2 similarity filter
boomb0om Dec 18, 2023
1e74c77
Add error handling in filters
boomb0om Dec 19, 2023
236fae7
Merge branch 'dev' of https://github.com/ai-forever/DataProcessingFra…
boomb0om Dec 19, 2023
a919a65
Merge pull request #24 from ai-forever/lysenko_blip_filter
boomb0om Dec 19, 2023
4e98358
Merge branch 'dev' of https://github.com/ai-forever/DataProcessingFra…
boomb0om Dec 19, 2023
df3027c
Initial review comment
pablopinta Dec 20, 2023
72a9d8c
Fix dataloader in converting
boomb0om Dec 20, 2023
64c5316
Add review coments
pablopinta Dec 20, 2023
1fbbeda
Add review comments
pablopinta Dec 20, 2023
a6ff36f
Update llava captioning filter
boomb0om Dec 22, 2023
81a27d9
Update readme.md
boomb0om Dec 22, 2023
93412fa
qMerge branch 'dev' of https://github.com/ai-forever/DataProcessingFr…
boomb0om Dec 22, 2023
58a0b22
Merge branch 'dev' into dev-review
boomb0om Dec 22, 2023
b923591
refactor: refactor configs
boomb0om Feb 9, 2024
37c61dc
refactor: refactor video info filter
boomb0om Feb 9, 2024
8d6fbef
refactor: refactor dataloaders
boomb0om Feb 9, 2024
b8c5161
refactor: refactor imports
boomb0om Feb 9, 2024
f1bc021
feat: add Video-LLaVA captioner
Feb 13, 2024
97bda41
refactor: update inference code for the Video-LLaVA model
Feb 13, 2024
ae340ac
fix: change column names
Feb 16, 2024
8f6e8fe
refactor: rename dataloaders utils
boomb0om Feb 17, 2024
d863402
test: add tests for filters
boomb0om Feb 17, 2024
d612458
fix: use raw strings for regexs
boomb0om Feb 17, 2024
825a60d
test: add tests for column filters
boomb0om Feb 17, 2024
468f599
refactor: refactor column filters
boomb0om Feb 17, 2024
46e019e
refactor: refactor schema in filters
boomb0om Feb 17, 2024
9766fa3
refactor: rename regexs lists
boomb0om Feb 17, 2024
8401160
fix: fix path column renaming in Files dataset
boomb0om Feb 18, 2024
a152ce9
Merge pull request #28 from ai-forever/fix-column-name
boomb0om Feb 18, 2024
a73a6e7
feat: add google translate filter
boomb0om Feb 18, 2024
00ce3d1
fix: add scipy requirement
boomb0om Feb 18, 2024
9fd0f80
Merge pull request #29 from ai-forever/translator-filter
boomb0om Feb 18, 2024
6b346db
refactor: refactor regexs init in RegexFilter
boomb0om Feb 18, 2024
c77aa9c
Merge pull request #27 from ai-forever/kirillova/video_llava
boomb0om Feb 19, 2024
10ba2e7
fix: fix typo errors in video llava filter
boomb0om Feb 21, 2024
d759c45
feat: add the initial version of the RAFT Optical Flow model
Feb 22, 2024
d0cd62e
fix: add resize of initial frames
Feb 26, 2024
d0ae6c0
fix: remove errors with imports and and non-existent variables
Feb 26, 2024
8a371a6
fix: fix the errors, clean the code base of the filter
Feb 26, 2024
3a8cea9
fix: fix error with index out of range
Feb 26, 2024
53caf91
fix: edit formatting of output captions
Feb 26, 2024
78ef237
feat: add Farnrback optical flow filter impelementation
Feb 26, 2024
f6fb53d
fix: remove print
Feb 26, 2024
5bebc01
refactor: update docstring and add new vars for main functions
Feb 26, 2024
1a7c049
refactor: update docstrings
Feb 26, 2024
69db044
fix: remove the errors with imports
Feb 27, 2024
8120fd7
refactor: edit the code
Feb 27, 2024
667c372
fix: add resize for vertical and square videos
Feb 27, 2024
9014455
feat: add base transforms and video & image resizing transforms
boomb0om Feb 27, 2024
3e64b05
test: add tests for resizer
boomb0om Feb 27, 2024
3391d8b
refactor: rename max_workers to workers
boomb0om Feb 27, 2024
5cb6022
docs: update README with transforms info
boomb0om Feb 27, 2024
4a9854f
fix: add pytest requirement
boomb0om Feb 28, 2024
aae5306
feat: make transforms update metadata
boomb0om Feb 28, 2024
2b3ce88
fix: move frame transforms to dataloader and add param to pass n frames
Feb 28, 2024
55a280a
update python version in workflow
boomb0om Feb 28, 2024
05a084d
Merge pull request #32 from ai-forever/pavlov/transforms
boomb0om Feb 28, 2024
c8b5332
merge: fix merge conflicts
boomb0om Feb 29, 2024
d3fa972
refactor: change new filters to new format
boomb0om Feb 29, 2024
e69ba99
fix: move frame transforms to dataloader and add param to pass n frames
Feb 29, 2024
fda6c25
Merge pull request #25 from ai-forever/dev-review
boomb0om Feb 29, 2024
a6433a6
fix: add downloading the model weights
Feb 29, 2024
9f35716
Merge pull request #31 from ai-forever/kirillova/raft_optical_flow
boomb0om Feb 29, 2024
8a20566
Merge pull request #30 from ai-forever/kirillova/farneback_optical_flow
boomb0om Feb 29, 2024
fabc2c2
refactor: change optical flow filters to new format
boomb0om Feb 29, 2024
c7ca27e
refactor: change video_llava prompt templates
boomb0om Feb 29, 2024
680232d
fix: add pass_frames to preprocessing
Mar 1, 2024
fef0633
fix: add pass_frames to preprocessing and remove DataParallel for mod…
Mar 1, 2024
424d160
Merge pull request #33 from ai-forever/kirillova/optical_flow_filters
boomb0om Mar 1, 2024
7eaac2e
fix: speed up video llava & fix invalid device
boomb0om Mar 1, 2024
04eb9f6
Merge branch 'dev' of https://github.com/ai-forever/DataProcessingFra…
boomb0om Mar 1, 2024
d8ceffe
feat: add pbar position arg to all filters
boomb0om Mar 3, 2024
38861b2
feat: add multi-gpu filter
boomb0om Mar 3, 2024
c0c75fc
refactor: change filter params arg
boomb0om Mar 3, 2024
d3cfd19
docs: add multigpu exampel
boomb0om Mar 3, 2024
44af029
refactor: change github action for test and add code style check
boomb0om Mar 3, 2024
1534567
feat: add pre-commit linting
boomb0om Mar 3, 2024
a82f3cc
refactor: refactor imports
boomb0om Mar 3, 2024
3d7020d
test pre-commit
boomb0om Mar 3, 2024
4a61500
refactor: change code-quality script & delete pre-commit
boomb0om Mar 3, 2024
a0ce5e1
feat: configure ruff linter
boomb0om Mar 4, 2024
2be174b
refactor: fix ruff code style
boomb0om Mar 4, 2024
8a0ef8b
feat: update isort config
boomb0om Mar 4, 2024
141e9f7
refactor: isort imports
boomb0om Mar 4, 2024
a912743
refactor: fix mypy errors
boomb0om Mar 5, 2024
535690d
fix: fix errors after update
boomb0om Mar 5, 2024
4beb9c0
refactor: refactor configs & dataset reader
boomb0om Mar 5, 2024
24ab586
refactor: refactor dataloaders
boomb0om Mar 5, 2024
300a661
feat: update ruff config
boomb0om Mar 5, 2024
c720b85
feat: update mypy settings
boomb0om Mar 5, 2024
6564c90
refactor: fix ruff linting & errors
boomb0om Mar 5, 2024
5d100ac
Merge pull request #34 from ai-forever/multi-gpu-filter
boomb0om Mar 5, 2024
f6b1632
fix merge conflicts
boomb0om Mar 5, 2024
b2980e1
fix: add checker for captions
Mar 6, 2024
6d42533
Merge pull request #35 from ai-forever/kirillova/fix_caption_duplicates
boomb0om Mar 6, 2024
325bf13
refactor: refactor configs & dataset reader
boomb0om Mar 6, 2024
c86d1e1
refactor: refactor dataloaders
boomb0om Mar 6, 2024
18ef2e7
refactor: refactor modality types
boomb0om Mar 7, 2024
881d6a7
refactor: refactor transforms
boomb0om Mar 9, 2024
475beac
refactor: refactor writers & processor utils
boomb0om Mar 9, 2024
e364952
fix: fix resizer errors
boomb0om Mar 9, 2024
453956f
refactor: refactor base filter classes
boomb0om Mar 9, 2024
6701cd9
refactor: refactor filters
boomb0om Mar 9, 2024
cf1dd6d
fix: update pyproject and requirements for filters
boomb0om Mar 9, 2024
9c7ca74
refactor: refactor validators
boomb0om Mar 9, 2024
96838c2
refactor: fix all mypy errors
boomb0om Mar 9, 2024
b206313
change: update pyproject.toml
boomb0om Mar 9, 2024
2688352
change: change installator
boomb0om Mar 9, 2024
70ebdd2
change: update workflow
boomb0om Mar 9, 2024
66c6f1b
fix req
boomb0om Mar 9, 2024
b334f51
change: update workflow
boomb0om Mar 9, 2024
d861e9e
fix req
boomb0om Mar 9, 2024
049addc
change: update workflow
boomb0om Mar 9, 2024
4d4092d
fix req
boomb0om Mar 9, 2024
bfd264b
fix: fix errors
boomb0om Mar 9, 2024
d7710f0
fix: fix errors
boomb0om Mar 9, 2024
f2eb479
refactor: upgrade type annotations
boomb0om Mar 10, 2024
5e4ec56
change: update workflow
boomb0om Mar 10, 2024
2a5bdc7
docs: add docstrings
boomb0om Mar 11, 2024
480b0ea
refactor: refactor filesystems
boomb0om Mar 12, 2024
a231705
fix: fix multigpu filter cuda error
boomb0om Mar 12, 2024
7117fb4
fix: remove unused import
boomb0om Mar 12, 2024
ae5e23d
docs: update documentation
boomb0om Mar 13, 2024
c32e0ba
feat: change fsspec to fsconnectors in s3connector
boomb0om Mar 13, 2024
f8db0c1
fix: fix errors in docs
boomb0om Mar 14, 2024
41cf6a7
change: add push event on pytest workflow
boomb0om Mar 14, 2024
1b694fe
refactor: fix merge conflicts
boomb0om Mar 14, 2024
15cc19a
change: remove pull-request event in workflow
boomb0om Mar 14, 2024
f5f847d
Merge pull request #36 from ai-forever/code-style-checks
boomb0om Mar 14, 2024
71f4003
docs: translated to eng
boomb0om Mar 18, 2024
3166cff
docs: add more docs and examples
boomb0om Mar 19, 2024
23db1fe
docs: update documentation
boomb0om Mar 20, 2024
c79d77d
fix: fix issues #37 & #38
boomb0om Mar 22, 2024
b5d67c9
fix: remove errors with captions
NastyaMittseva Mar 26, 2024
01a8417
fix: remove the 2nd append
NastyaMittseva Mar 26, 2024
f8a0ad4
fix: change return value of function
gofixyourself Mar 28, 2024
6998a3e
Merge pull request #40 from ai-forever/kirillova/video_llava_fix_capt…
boomb0om Mar 28, 2024
00615e9
fix: fix readme paths and add example video dataset
boomb0om Mar 29, 2024
25a59ce
change: update requirements and change videollava cache_dir
boomb0om Mar 29, 2024
bb0ad6c
feat: add video filters example
boomb0om Mar 30, 2024
27eaccb
change: make better filters schema
boomb0om Mar 30, 2024
2920871
Merge branch 'dev' of https://github.com/ai-forever/DataProcessingFra…
boomb0om Mar 30, 2024
09e4ee0
feat: add filters examples
boomb0om Mar 30, 2024
1cfeabe
refactor: small refactor translator and lang filters
boomb0om Mar 30, 2024
dc81dac
docs: add examples in docs
boomb0om Mar 30, 2024
7dbbf81
chore: corrected version of llama
LysenkoAnastasia Apr 1, 2024
7af1e3c
docs: update filters documentation
boomb0om Apr 8, 2024
85125dc
fix: fix typing mismatches
boomb0om Apr 9, 2024
602e48c
docs: update docs
boomb0om Apr 9, 2024
f5ca4ff
feat: add new example
boomb0om Apr 9, 2024
0125cbe
Merge pull request #41 from ai-forever/v1.0
boomb0om Apr 9, 2024
097258b
fix: update the default temperature for Video-LLaVA
gofixyourself Apr 9, 2024
b652cc5
refactor: update the code
gofixyourself Apr 9, 2024
14ec239
fix: upgrade the code in optical flow filters
gofixyourself Apr 9, 2024
f313387
refactor: fix the pep8 warnings
gofixyourself Apr 9, 2024
58b026a
refactor: add the type annotation
gofixyourself Apr 9, 2024
9aa9166
refactor: fix the types of variables
gofixyourself Apr 9, 2024
87c02da
refactor: add numpy type for var
gofixyourself Apr 9, 2024
988f8fc
refactor: update type annotation
gofixyourself Apr 9, 2024
5bfd0d4
refactor: fix pep8 warnings and errors
gofixyourself Apr 9, 2024
cdacfc2
refactor: fix pep8 warnings and errors
gofixyourself Apr 9, 2024
f16eba7
Merge pull request #43 from ai-forever/kirillova/video_llava_hyperparam
boomb0om Apr 11, 2024
4d06df9
Merge pull request #44 from ai-forever/kirillova/optical_flow_ups
boomb0om Apr 11, 2024
93e1400
feat: add LITA video captioning filter
gofixyourself Apr 11, 2024
785b0c3
fix: update project imports
gofixyourself Apr 11, 2024
975435c
fix: add ignoring errors
gofixyourself Apr 11, 2024
9051066
fix: add ignoring errors
gofixyourself Apr 11, 2024
9719629
fix: pep8 warnings
gofixyourself Apr 11, 2024
68cd4f1
feat: add examples and docs for LITA
boomb0om Apr 12, 2024
488a46f
Merge pull request #45 from ai-forever/kirillova/lita_video_captioner
boomb0om Apr 12, 2024
61cc8e6
feat: add filter pipelines
boomb0om Apr 15, 2024
d594b5c
docs: add pipelines documentation
boomb0om Apr 15, 2024
ece0838
fix: try to change import
boomb0om Apr 15, 2024
6cbd797
fix: remove pipelines from .gitignore
boomb0om Apr 15, 2024
c9e9552
fix: linter fixes
boomb0om Apr 15, 2024
977a232
fix: fix error with multiprocessing in multigpu filter
boomb0om Apr 16, 2024
271689e
docs: add pipelines example notebook
boomb0om Apr 16, 2024
b0682ef
Merge pull request #46 from ai-forever/v1.0
boomb0om Apr 16, 2024
25900cc
fix: change device choice in model loader function
gofixyourself Apr 19, 2024
45579ab
fix: remove linter errors
gofixyourself Apr 19, 2024
bcd6b7e
fix: remove linter errors
gofixyourself Apr 19, 2024
d84db51
fix: add exceptions for untyped functions
gofixyourself Apr 19, 2024
9758156
fix: remove linter errors
gofixyourself Apr 19, 2024
49cd1e9
fix: change imports order
gofixyourself Apr 19, 2024
ffae303
feat: add video fps tranform
boomb0om May 5, 2024
23de547
feat: add transforms in pipelines
boomb0om May 6, 2024
2e3c5f0
Merge pull request #49 from ai-forever/new-video-pipe
boomb0om May 6, 2024
03e20c3
fix: fix fps meta changing
boomb0om May 7, 2024
75f2d09
change: refactor video transforms
boomb0om May 8, 2024
2d4f74e
feat: add videos cutting
boomb0om May 13, 2024
99f6c6f
docs: update doc
boomb0om May 13, 2024
4486ab7
Merge pull request #50 from ai-forever/video-cut
boomb0om May 13, 2024
1822b22
Merge pull request #47 from ai-forever/kirillova/lita_multigpu_fix
boomb0om May 14, 2024
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: add filter pipelines
  • Loading branch information
boomb0om committed Apr 15, 2024
commit 61cc8e6f73d97fde6aa46bdfd3bc7ec68bde7f5a
53 changes: 53 additions & 0 deletions DPF/filters/images/dummy_gpu_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Any, Union

import torch

from DPF.types import ModalityToDataMapping

from .img_filter import ImageFilter


class DummyGPUFilter(ImageFilter):
    """
    Placeholder image filter intended for testing.

    The filter never looks at the image data: every sample is labelled with
    the constant ``1`` in the ``dummy_label`` column.
    """

    def __init__(
        self,
        workers: int = 16,
        device: Union[str, torch.device] = "cuda",
        pbar: bool = True,
        _pbar_position: int = 0
    ):
        """
        Parameters
        ----------
        workers: int = 16
            Number of dataloader workers
        device: Union[str, torch.device] = "cuda"
            Device identifier; it is only stored on the instance, this class does not use it
        pbar: bool = True
            Forwarded to the base class constructor
        _pbar_position: int = 0
            Forwarded to the base class constructor
        """
        super().__init__(pbar, _pbar_position)
        self.device = device
        self.num_workers = workers

    @property
    def result_columns(self) -> list[str]:
        """Name of the single column produced by this filter."""
        return ["dummy_label"]

    @property
    def dataloader_kwargs(self) -> dict[str, Any]:
        """Dataloader settings: one sample per batch, incomplete batches are kept."""
        loader_settings: dict[str, Any] = {}
        loader_settings["num_workers"] = self.num_workers
        loader_settings["batch_size"] = 1
        loader_settings["drop_last"] = False
        return loader_settings

    def preprocess_data(
        self,
        modality2data: ModalityToDataMapping,
        metadata: dict[str, Any]
    ) -> Any:
        """Return ``(key, 1)`` for a sample; ``modality2data`` is intentionally ignored."""
        return metadata[self.key_column], 1

    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
        """
        Collect the keys and dummy labels of a batch into the result mapping.

        ``batch`` is a list of ``(key, label)`` pairs as produced by
        ``preprocess_data``.
        """
        # presumably a mapping of column name -> empty list; provided by the base class
        batch_result = self._get_dict_from_schema()
        label_column = self.result_columns[0]

        # transpose [(key, label), ...] into (keys, labels)
        keys, dummy_labels = zip(*batch)
        batch_result[self.key_column].extend(keys)
        batch_result[label_column].extend(dummy_labels)

        return batch_result
10 changes: 10 additions & 0 deletions DPF/filters/multigpu_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@ def __init__(
self.devices = devices
self.num_parts = len(devices)

# getting result columns names
datafilter = self.filter_class(**self.filter_params, device=devices[0]) # type: ignore
self._result_columns = datafilter.result_columns
del datafilter
torch.cuda.empty_cache()

@property
def result_columns(self) -> list[str]:
    """Result column names stored in ``self._result_columns``."""
    return self._result_columns

def run(
self,
df: pd.DataFrame,
Expand Down
1 change: 1 addition & 0 deletions DPF/pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .filter_pipeline import FilterPipeline
6 changes: 5 additions & 1 deletion DPF/processors/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ def columns(self) -> list[str]:
"""Columns that presented in dataframe"""
return self._df.columns.tolist() # type: ignore

@property
def modalities(self) -> list[str]:
    """Modality names: the keys of ``config.modality2datatype``"""
    modality_mapping = self.config.modality2datatype
    return list(modality_mapping)

def __getitem__(self, column_name: str) -> pd.Series:
    """Return the column ``column_name`` of the underlying dataframe"""
    return self._df[column_name]

Expand All @@ -63,7 +67,7 @@ def print_summary(self) -> None:
"""Prints summary info about dataset"""
print('Dataset format:', config2format(self.config))
print('Path:', self.config.path)
print('Modalities:', list(self.config.modality2datatype.keys()))
print('Modalities:', self.modalities)

cols = self.columns
print('Columns:', len(cols))
Expand Down
36 changes: 31 additions & 5 deletions DPF/utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import logging.config
import os
import sys
from typing import Any

LOGGERS_CONFIG = {
LOGGERS_CONFIG: dict[str, Any] = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {"default": {"format": "[%(asctime)s][%(levelname)s]: %(message)s"}},
Expand All @@ -27,17 +28,42 @@
}


def get_logging_config_copy() -> dict[str, Any]:
    """Return a fully independent copy of ``LOGGERS_CONFIG``.

    Callers modify nested entries of the returned config (for example
    ``config["handlers"]["file"]["filename"]``), so the copy has to be deep:
    copying only the top two levels would leave the inner handler/logger
    dicts shared with the module-level template and let those writes leak
    into it.

    Returns
    -------
    dict[str, Any]
        A deep copy of ``LOGGERS_CONFIG`` that can be mutated freely
    """
    import copy  # local import: keeps this function self-contained

    return copy.deepcopy(LOGGERS_CONFIG)


def init_logger(filename: str, logger_name: str = "filter_logger", logging_dir: str = "./logs/") -> logging.Logger:
os.makedirs(logging_dir, exist_ok=True)

LOGGERS_CONFIG["handlers"]["file"]["filename"] = os.path.join(logging_dir, filename) # type: ignore [index]
LOGGERS_CONFIG["loggers"][logger_name] = { # type: ignore [index]
config = get_logging_config_copy()

config["handlers"]["file"]["filename"] = os.path.join(logging_dir, filename)
config["loggers"][logger_name] = {
"handlers": ["console", "file"],
"level": logging.DEBUG,
}

logging.config.dictConfig(LOGGERS_CONFIG)
logging.config.dictConfig(config)
logger = logging.getLogger(logger_name)

return logger


def init_stdout_logger(logger_name: str = "filter_logger") -> logging.Logger:
    """Configure and return a logger that uses only the ``console`` handler.

    A copy of the logging config is taken, ``logger_name`` is registered with
    the ``console`` handler at DEBUG level, and the ``file`` handler and the
    ``template`` logger are removed before the config is applied.

    Parameters
    ----------
    logger_name: str = "filter_logger"
        Name of the logger to configure and return

    Returns
    -------
    logging.Logger
        The configured logger; an info message is emitted on it before returning
    """
    stdout_config = get_logging_config_copy()

    stdout_config["loggers"][logger_name] = {
        "handlers": ["console"],
        "level": logging.DEBUG,
    }
    # no file output here: remove the file handler and the "template" logger
    # (presumably "template" references the file handler -- verify against LOGGERS_CONFIG)
    del stdout_config["handlers"]["file"]
    del stdout_config["loggers"]["template"]

    logging.config.dictConfig(stdout_config)

    stdout_logger = logging.getLogger(logger_name)
    stdout_logger.info("Logger initialized")
    return stdout_logger
118 changes: 118 additions & 0 deletions tests/test_pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from DPF import DatasetReader
from DPF.configs import ShardsDatasetConfig
from DPF.filters.images.dummy_gpu_filter import DummyGPUFilter
from DPF.filters.images.hash_filters import PHashFilter
from DPF.filters.images.info_filter import ImageInfoFilter
from DPF.pipelines import FilterPipeline


def test_pipeline_imageinfo():
    """Running ImageInfoFilter in a pipeline adds width/height/channels columns."""
    dataset_config = ShardsDatasetConfig.from_path_and_columns(
        'tests/datasets/shards_correct',
        image_name_col="image_name",
        text_col="caption"
    )
    processor = DatasetReader().read_from_config(dataset_config)

    pipeline = FilterPipeline("test_pipeline_imageinfo")
    pipeline.add_datafilter(ImageInfoFilter, {'workers': 1})
    pipeline.run(processor)

    # all three info columns must be present after the run
    info_columns = {'width', 'height', 'channels'}
    assert info_columns.issubset(processor.df.columns)


def test_pipeline_imageinfo_bad_1():
    """With on_error="continue" the run does not raise and adds no info columns."""
    dataset_config = ShardsDatasetConfig.from_path_and_columns(
        'tests/datasets/shards_bad_image',
        image_name_col="image_name",
        text_col="caption"
    )
    processor = DatasetReader().read_from_config(dataset_config)

    pipeline = FilterPipeline("test_pipeline_imageinfo_bad")
    pipeline.add_datafilter(ImageInfoFilter, {'workers': 1}, on_error="continue")
    pipeline.run(processor)

    # none of the info columns may have been added
    info_columns = {'width', 'height', 'channels'}
    assert info_columns.isdisjoint(processor.df.columns)


def test_pipeline_imageinfo_bad_2():
    """With on_error="stop" the run on the bad-image dataset must raise."""
    dataset_config = ShardsDatasetConfig.from_path_and_columns(
        'tests/datasets/shards_bad_image',
        image_name_col="image_name",
        text_col="caption"
    )
    processor = DatasetReader().read_from_config(dataset_config)

    pipeline = FilterPipeline("test_pipeline_imageinfo_bad")
    pipeline.add_datafilter(ImageInfoFilter, {'workers': 1}, on_error="stop")

    # any exception type counts; only the fact that one was raised is checked
    raised = False
    try:
        pipeline.run(processor)
    except Exception:
        raised = True
    assert raised


def test_pipeline_phash_dedup():
    """PHash filter followed by deduplication on the hash leaves a single row."""
    dataset_config = ShardsDatasetConfig.from_path_and_columns(
        'tests/datasets/shards_correct',
        image_name_col="image_name",
        text_col="caption"
    )
    processor = DatasetReader().read_from_config(dataset_config)

    pipeline = FilterPipeline("test_pipeline_phash_dedup")
    pipeline.add_datafilter(PHashFilter, {'workers': 1})
    # deduplicate on the column the hash filter is expected to produce
    pipeline.add_deduplication(['image_phash_8'])
    pipeline.run(processor)

    remaining_rows = len(processor.df)
    assert remaining_rows == 1


def test_pipeline_multigpu():
    """DummyGPUFilter run on two devices keeps both rows and adds ``dummy_label``.

    NOTE: the filter is requested on "cuda:0" and "cuda:1", so this test
    needs two visible CUDA devices.
    """
    path = 'tests/datasets/shards_correct'
    config = ShardsDatasetConfig.from_path_and_columns(
        path,
        image_name_col="image_name",
        text_col="caption"
    )

    reader = DatasetReader()
    processor = reader.read_from_config(config)

    # pipeline is named after this test (was a copy-paste of the phash test name)
    pipeline = FilterPipeline("test_pipeline_multigpu")
    pipeline.add_datafilter(
        DummyGPUFilter,
        {'workers': 1},
        devices=["cuda:0", "cuda:1"]
    )
    pipeline.run(processor)

    # separate asserts so a failure shows which expectation was violated
    assert len(processor.df) == 2
    assert 'dummy_label' in processor.columns