123 changes: 123 additions & 0 deletions DPF/filters/images/complexity_filter.py
@@ -0,0 +1,123 @@
import os
from typing import Any
from urllib.request import urlretrieve

import numpy as np
import torch
from segment_anything import ( # type: ignore
SamAutomaticMaskGenerator,
sam_model_registry,
)

from DPF.utils import read_image_rgb_from_bytes

from ...types import ModalityToDataMapping
from .img_filter import ImageFilter

WEIGHTS_URL = {
    'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
    'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
    'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
}


class ComplexityFilter(ImageFilter):
"""
Image complexity filter based on SAM: https://github.com/facebookresearch/segment-anything

Parameters
----------
weights_folder: str
Folder where the weights will be stored
model_name: str = 'vit_h'
        Model version to use: vit_h - huge, vit_l - large, vit_b - base
points_per_side: int = 32
        Number of points sampled along each side of the image; controls the granularity of automatic segmentation
batch_size: int = 1
        Points per batch used by SAM during mask calculation for one image
device: str = "cuda:0"
Device to use
workers: int = 16
        Number of processes to use for reading data and computing complexity scores
pbar: bool = True
Whether to use a progress bar
"""

def __init__(
self,
weights_folder: str,
model_name: str = 'vit_h',
points_per_side: int = 32,
workers: int = 16,
batch_size: int = 1,
device: str = "cuda:0",
pbar: bool = True,
_pbar_position: int = 0
):
super().__init__(pbar, _pbar_position)
self.num_workers = workers
self.batch_size = batch_size
self.device = device

self.model_name = model_name
self.weights_folder = weights_folder
self.points_per_side = points_per_side

# Download checkpoints
path_to_model = os.path.join(self.weights_folder, self.model_name + '.pth')
if not os.path.exists(path_to_model):
os.makedirs(self.weights_folder, exist_ok=True)
urlretrieve(WEIGHTS_URL[self.model_name], path_to_model)

sam = sam_model_registry[self.model_name](checkpoint=path_to_model)
sam = sam.to(torch.device(self.device))
self.mask_generator = SamAutomaticMaskGenerator(
sam, points_per_batch=batch_size,
points_per_side=points_per_side
)

@property
def result_columns(self) -> list[str]:
return ["complexity_num_segments", "complexity_max_segment_area", "complexity_mean_segment_area"]

@property
def dataloader_kwargs(self) -> dict[str, Any]:
return {
"num_workers": self.num_workers,
"batch_size": 1,
"drop_last": False,
}

def preprocess_data(
self,
modality2data: ModalityToDataMapping,
metadata: dict[str, Any]
) -> Any:
key = metadata[self.key_column]
pil_img = read_image_rgb_from_bytes(modality2data['image'])
img = np.array(pil_img)
return key, img

def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
df_batch_labels = self._get_dict_from_schema()

for data in batch:
key, img = data
h, w = img.shape[:2]
hw = h * w
with torch.no_grad():
outputs = self.mask_generator.generate(img)
num_segments = len(outputs)
if num_segments > 0:
areas = [x['area'] for x in outputs]
bg_area = hw - np.sum(areas)
areas.append(bg_area)
max_area = np.max(areas) / hw
mean_area = np.mean(areas) / hw
else:
max_area = mean_area = 0

df_batch_labels["complexity_num_segments"].extend([num_segments])
df_batch_labels["complexity_max_segment_area"].extend([max_area])
df_batch_labels["complexity_mean_segment_area"].extend([mean_area])
df_batch_labels[self.key_column].extend([key])

return df_batch_labels
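
A minimal usage sketch for ComplexityFilter follows. Only the constructor arguments and the result columns come from the file above; the ShardsDatasetConfig / DatasetReader / apply_data_filter calls, the processor.df access, the dataset path and the column name are assumptions about DPF's usual reading flow, not part of this diff.

# Usage sketch (assumed DPF reading API; paths and column names are placeholders)
from DPF import DatasetReader, ShardsDatasetConfig
from DPF.filters.images.complexity_filter import ComplexityFilter

config = ShardsDatasetConfig.from_path(      # assumed config helper
    "examples/example_dataset",              # placeholder dataset path
    image_name_col="image_name",             # placeholder column name
)
processor = DatasetReader().read_from_config(config)

complexity_filter = ComplexityFilter(
    weights_folder="weights/",   # SAM checkpoint is downloaded here on first use
    model_name="vit_b",          # smallest checkpoint; vit_h is the default
    points_per_side=32,
    batch_size=64,               # forwarded to SAM as points_per_batch
    device="cuda:0",
    workers=8,
)
processor.apply_data_filter(complexity_filter)

# Adds complexity_num_segments, complexity_max_segment_area and complexity_mean_segment_area
print(processor.df[["complexity_num_segments", "complexity_max_segment_area"]].head())

Note that batch_size is forwarded to SamAutomaticMaskGenerator as points_per_batch, so raising it mostly trades GPU memory for mask-generation speed on each image.
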
222 changes: 222 additions & 0 deletions DPF/filters/videos/cogvlm2_filter.py
@@ -0,0 +1,222 @@
import re
from io import BytesIO
from typing import Any

import numpy as np
import torch
from decord import VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from DPF.types import ModalityToDataMapping

from .video_filter import VideoFilter

prompt_templates = {
'detailed_video': 'Describe this video and its style in a very detailed manner',
'short_video': 'Describe this video and its style briefly',
    '1_sentence': "Describe this video very shortly in 1 sentence."
}
MODEL_PATH = "THUDM/cogvlm2-video-llama3-chat"
TORCH_TYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    else torch.float16
)

compiled_regexs = [
(re.compile(r'the video (also )?is '), ''),
(re.compile(r'the video (also )?features '), ''),
(re.compile(r'the video (also )?shows '), ''),
(re.compile(r'the video (also )?depicts '), ''),
(re.compile(r'the video (also )?showcases '), ''),
(re.compile(r'the video (also )?captures '), ''),
(re.compile(r'the video (also )?provides '), ''),
(re.compile(r'throughout the video, '), ''),
]


def clean_with_regex(caption: str) -> str:
    """Strip generic 'the video shows/depicts/...' lead-ins and excessive hyphens from a caption."""
    lower_caption = str(caption).lower().strip()
for re_compiled, replacement in compiled_regexs:
iterator = reversed(list(re_compiled.finditer(lower_caption)))
for match in iterator:
pos = list(match.span())
caption = caption[:pos[0]] + replacement + caption[pos[1]:]
lower_caption = str(caption).lower().strip()

if caption.count('-') > 2:
split_captions = []
for split_caption in caption.split():
if split_caption.count('-') > 2:
split_caption = re.sub(r'-', ' ', split_caption)
split_captions.append(split_caption)
caption = ' '.join(split_captions)

caption = caption.strip('—-:/+=|@#&*')

return caption.strip()


class CogVLM2Filter(VideoFilter):
"""
    CogVLM2 inference filter that generates captions for automatic video labeling.
More info about the model here: https://github.com/THUDM/CogVLM2

Parameters
----------
    prompt: str = '1_sentence'
        Prompt template to use: 'detailed_video', 'short_video' or '1_sentence'
    quant: int = 16
        Model quantization mode: 4, 8 or 16 (no quantization)
    num_frames: int = 24
        Number of frames to sample from the video
    temperature: float = 0.05
        Sampling temperature for caption generation
    max_new_tokens: int = 1024
        Maximum number of new tokens to generate
    device: str = "cuda:0"
        Device to use
    workers: int = 16
        Number of processes to use for reading data and generating captions
    pbar: bool = True
        Whether to use a progress bar
"""
def __init__(
self,
        prompt: str = '1_sentence',
quant: int = 16,
num_frames: int = 24,
temperature: float = 0.05,
max_new_tokens: int = 1024,
device: str = "cuda:0",
workers: int = 16,
pbar: bool = True,
_pbar_position: int = 0
):
super().__init__(pbar, _pbar_position)
self.strategy = 'chat'
self.prompt = prompt
self.tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# padding_side="left"
)
self.num_frames = num_frames

if quant == 4:
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True,
revision='ca14f13b05f5ead425188aae3e5e725bf4905cd1'
).eval()
elif quant == 8:
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
                quantization_config=BitsAndBytesConfig(
                    load_in_8bit=True,
                ),
low_cpu_mem_usage=True,
revision='ca14f13b05f5ead425188aae3e5e725bf4905cd1'
).eval()
else:
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
revision='ca14f13b05f5ead425188aae3e5e725bf4905cd1'
).eval().to(device)

self.query = prompt_templates[prompt]

self.num_workers = workers
self.device = device

self.temperature = temperature
self.max_new_tokens = max_new_tokens

@property
def result_columns(self) -> list[str]:
return ["caption_cogvlm", "caption_cogvlm_clean"]

@property
def dataloader_kwargs(self) -> dict[str, Any]:
return {
"num_workers": self.num_workers,
"batch_size": 1,
"drop_last": False,
}

def preprocess_data(
self,
modality2data: ModalityToDataMapping,
metadata: dict[str, Any]
) -> Any:
key = metadata[self.key_column]
video_file = BytesIO(modality2data['video'])
loaded_video_file = self.load_video(video_file, strategy=self.strategy)
return key, loaded_video_file

def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
df_batch_labels = self._get_dict_from_schema()

key, video = batch[0]
inputs = self.model.build_conversation_input_ids(
tokenizer=self.tokenizer,
query=self.query,
images=[video],
history=[],
template_version=self.strategy
)

inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
'images': [[inputs['images'][0].to(self.device).to(TORCH_TYPE)]],
}
gen_kwargs = {
"max_new_tokens": self.max_new_tokens,
"pad_token_id": 128002,
"top_k": 1,
"do_sample": True,
"top_p": 0.1,
"temperature": self.temperature,
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
response_clean = clean_with_regex(response)
df_batch_labels[self.schema[1]].extend([response])
df_batch_labels[self.schema[2]].extend([response_clean])
df_batch_labels[self.key_column].extend([key])
return df_batch_labels


def load_video(self, video_path: BytesIO, strategy: str = 'chat') -> torch.Tensor:
bridge.set_bridge('torch')
num_frames = self.num_frames

decord_vr = VideoReader(uri=video_path)
total_frames = len(decord_vr)
if strategy == 'base':
frame_id_list = np.linspace(0, total_frames - 1, num_frames, dtype=int)
elif strategy == 'chat':
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = [] # type: ignore
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index) # type: ignore
if len(frame_id_list) >= num_frames:
break
        else:
            raise ValueError(f"Unknown frame sampling strategy: {strategy}")
video_data: torch.Tensor = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
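
The caption cleanup above is easiest to see on a concrete input. The sketch below only calls clean_with_regex as defined in this file; the sample captions are made up for illustration.

# Illustration of clean_with_regex on made-up CogVLM2-style captions
from DPF.filters.videos.cogvlm2_filter import clean_with_regex

print(clean_with_regex("The video shows a cat chasing a red laser dot across a wooden floor."))
# -> "a cat chasing a red laser dot across a wooden floor."

print(clean_with_regex("Throughout the video, the camera slowly pans over a snow-covered-mountain-ridge."))
# -> "the camera slowly pans over a snow covered mountain ridge."
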