123 changes: 123 additions & 0 deletions DPF/filters/images/complexity_filter.py
@@ -0,0 +1,123 @@
import os
from typing import Any
from urllib.request import urlretrieve

import numpy as np
import torch
from segment_anything import ( # type: ignore
SamAutomaticMaskGenerator,
sam_model_registry,
)

from DPF.utils import read_image_rgb_from_bytes

from ...types import ModalityToDataMapping
from .img_filter import ImageFilter

WEIGHTS_URL = {
    'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
    'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
    'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
}


class ComplexityFilter(ImageFilter):
"""
Image complexity filter based on SAM: https://github.com/facebookresearch/segment-anything

Parameters
----------
weights_folder: str
Folder where the weights will be stored
model_name: str = 'vit_h'
        Model version to use: vit_h - huge, vit_l - large, vit_b - base
points_per_side: int = 32
        Number of points sampled along each side of the image; controls the granularity of automatic segmentation
batch_size: int = 1
        Points per batch used by SAM during mask calculation for one image
device: str = "cuda:0"
Device to use
workers: int = 16
        Number of processes to use for reading data and computing complexity scores
pbar: bool = True
Whether to use a progress bar
"""

def __init__(
self,
weights_folder: str,
model_name: str = 'vit_h',
points_per_side: int = 32,
workers: int = 16,
batch_size: int = 1,
device: str = "cuda:0",
pbar: bool = True,
_pbar_position: int = 0
):
super().__init__(pbar, _pbar_position)
self.num_workers = workers
self.batch_size = batch_size
self.device = device

self.model_name = model_name
self.weights_folder = weights_folder
self.points_per_side = points_per_side

# Download checkpoints
path_to_model = os.path.join(self.weights_folder, self.model_name + '.pth')
if not os.path.exists(path_to_model):
os.makedirs(self.weights_folder, exist_ok=True)
urlretrieve(WEIGHTS_URL[self.model_name], path_to_model)

sam = sam_model_registry[self.model_name](checkpoint=path_to_model)
sam = sam.to(torch.device(self.device))
self.mask_generator = SamAutomaticMaskGenerator(
sam, points_per_batch=batch_size,
points_per_side=points_per_side
)

@property
def result_columns(self) -> list[str]:
return ["complexity_num_segments", "complexity_max_segment_area", "complexity_mean_segment_area"]

@property
def dataloader_kwargs(self) -> dict[str, Any]:
return {
"num_workers": self.num_workers,
"batch_size": 1,
"drop_last": False,
}

def preprocess_data(
self,
modality2data: ModalityToDataMapping,
metadata: dict[str, Any]
) -> Any:
key = metadata[self.key_column]
pil_img = read_image_rgb_from_bytes(modality2data['image'])
img = np.array(pil_img)
return key, img

def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
df_batch_labels = self._get_dict_from_schema()

for data in batch:
key, img = data
h, w = img.shape[:2]
hw = h * w
with torch.no_grad():
outputs = self.mask_generator.generate(img)
num_segments = len(outputs)
if num_segments > 0:
areas = [x['area'] for x in outputs]
bg_area = hw - np.sum(areas)
areas.append(bg_area)
max_area = np.max(areas) / hw
mean_area = np.mean(areas) / hw
else:
max_area = mean_area = 0

df_batch_labels["complexity_num_segments"].extend([num_segments])
df_batch_labels["complexity_max_segment_area"].extend([max_area])
df_batch_labels["complexity_mean_segment_area"].extend([mean_area])
df_batch_labels[self.key_column].extend([key])

return df_batch_labels
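
A minimal usage sketch for ComplexityFilter follows. Only the constructor arguments and the result columns come from the file above; the ShardsDatasetConfig / DatasetReader / apply_data_filter calls, the processor.df access, the dataset path and the column name are assumptions about DPF's usual reading flow, not part of this diff.

# Usage sketch (assumed DPF reading API; paths and column names are placeholders)
from DPF import DatasetReader, ShardsDatasetConfig
from DPF.filters.images.complexity_filter import ComplexityFilter

config = ShardsDatasetConfig.from_path(      # assumed config helper
    "examples/example_dataset",              # placeholder dataset path
    image_name_col="image_name",             # placeholder column name
)
processor = DatasetReader().read_from_config(config)

complexity_filter = ComplexityFilter(
    weights_folder="weights/",   # SAM checkpoint is downloaded here on first use
    model_name="vit_b",          # smallest checkpoint; vit_h is the default
    points_per_side=32,
    batch_size=64,               # forwarded to SAM as points_per_batch
    device="cuda:0",
    workers=8,
)
processor.apply_data_filter(complexity_filter)

# Adds complexity_num_segments, complexity_max_segment_area and complexity_mean_segment_area
print(processor.df[["complexity_num_segments", "complexity_max_segment_area"]].head())

Note that batch_size is forwarded to SamAutomaticMaskGenerator as points_per_batch, so raising it mostly trades GPU memory for mask-generation speed on each image.
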
222 changes: 222 additions & 0 deletions DPF/filters/videos/cogvlm2_filter.py
@@ -0,0 +1,222 @@
import re
from io import BytesIO
from typing import Any

import numpy as np
import torch
from decord import VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from DPF.types import ModalityToDataMapping

from .video_filter import VideoFilter

prompt_templates = {
'detailed_video': 'Describe this video and its style in a very detailed manner',
'short_video': 'Describe this video and its style briefly',
    '1_sentence': "Describe this video very shortly in 1 sentence."
}
MODEL_PATH = "THUDM/cogvlm2-video-llama3-chat"
TORCH_TYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    else torch.float16
)

compiled_regexs = [
(re.compile(r'the video (also )?is '), ''),
(re.compile(r'the video (also )?features '), ''),
(re.compile(r'the video (also )?shows '), ''),
(re.compile(r'the video (also )?depicts '), ''),
(re.compile(r'the video (also )?showcases '), ''),
(re.compile(r'the video (also )?captures '), ''),
(re.compile(r'the video (also )?provides '), ''),
(re.compile(r'throughout the video, '), ''),
]


def clean_with_regex(caption: str) -> str:
    """Strip generic 'the video shows/depicts/...' lead-ins and excessive hyphens from a caption."""
    lower_caption = str(caption).lower().strip()
for re_compiled, replacement in compiled_regexs:
iterator = reversed(list(re_compiled.finditer(lower_caption)))
for match in iterator:
pos = list(match.span())
caption = caption[:pos[0]] + replacement + caption[pos[1]:]
lower_caption = str(caption).lower().strip()

if caption.count('-') > 2:
split_captions = []
for split_caption in caption.split():
if split_caption.count('-') > 2:
split_caption = re.sub(r'-', ' ', split_caption)
split_captions.append(split_caption)
caption = ' '.join(split_captions)

caption = caption.strip('—-:/+=|@#&*')

return caption.strip()


class CogVLM2Filter(VideoFilter):
"""
    CogVLM2 inference filter that generates captions for automatic video labeling.
More info about the model here: https://github.com/THUDM/CogVLM2

Parameters
----------
    prompt: str = '1_sentence'
        Prompt template to use: 'detailed_video', 'short_video' or '1_sentence'
    quant: int = 16
        Model quantization mode: 4, 8 or 16 (no quantization)
    num_frames: int = 24
        Number of frames to sample from the video
    temperature: float = 0.05
        Sampling temperature for caption generation
    max_new_tokens: int = 1024
        Maximum number of new tokens to generate
    device: str = "cuda:0"
        Device to use
    workers: int = 16
        Number of processes to use for reading data and generating captions
    pbar: bool = True
        Whether to use a progress bar
"""
def __init__(
self,
        prompt: str = '1_sentence',
quant: int = 16,
num_frames: int = 24,
temperature: float = 0.05,
max_new_tokens: int = 1024,
device: str = "cuda:0",
workers: int = 16,
pbar: bool = True,
_pbar_position: int = 0
):
super().__init__(pbar, _pbar_position)
self.strategy = 'chat'
self.prompt = prompt
self.tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# padding_side="left"
)
self.num_frames = num_frames

if quant == 4:
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True,
revision='ca14f13b05f5ead425188aae3e5e725bf4905cd1'
).eval()
elif quant == 8:
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
                quantization_config=BitsAndBytesConfig(
                    load_in_8bit=True,
                ),
low_cpu_mem_usage=True,
revision='ca14f13b05f5ead425188aae3e5e725bf4905cd1'
).eval()
else:
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
revision='ca14f13b05f5ead425188aae3e5e725bf4905cd1'
).eval().to(device)

self.query = prompt_templates[prompt]

self.num_workers = workers
self.device = device

self.temperature = temperature
self.max_new_tokens = max_new_tokens

@property
def result_columns(self) -> list[str]:
return ["caption_cogvlm", "caption_cogvlm_clean"]

@property
def dataloader_kwargs(self) -> dict[str, Any]:
return {
"num_workers": self.num_workers,
"batch_size": 1,
"drop_last": False,
}

def preprocess_data(
self,
modality2data: ModalityToDataMapping,
metadata: dict[str, Any]
) -> Any:
key = metadata[self.key_column]
video_file = BytesIO(modality2data['video'])
loaded_video_file = self.load_video(video_file, strategy=self.strategy)
return key, loaded_video_file

def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
df_batch_labels = self._get_dict_from_schema()

key, video = batch[0]
inputs = self.model.build_conversation_input_ids(
tokenizer=self.tokenizer,
query=self.query,
images=[video],
history=[],
template_version=self.strategy
)

inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
'images': [[inputs['images'][0].to(self.device).to(TORCH_TYPE)]],
}
gen_kwargs = {
"max_new_tokens": self.max_new_tokens,
"pad_token_id": 128002,
"top_k": 1,
"do_sample": True,
"top_p": 0.1,
"temperature": self.temperature,
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
response_clean = clean_with_regex(response)
df_batch_labels[self.schema[1]].extend([response])
df_batch_labels[self.schema[2]].extend([response_clean])
df_batch_labels[self.key_column].extend([key])
return df_batch_labels


def load_video(self, video_path: BytesIO, strategy: str = 'chat') -> torch.Tensor:
bridge.set_bridge('torch')
num_frames = self.num_frames

decord_vr = VideoReader(uri=video_path)
total_frames = len(decord_vr)
if strategy == 'base':
frame_id_list = np.linspace(0, total_frames - 1, num_frames, dtype=int)
elif strategy == 'chat':
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = [] # type: ignore
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index) # type: ignore
if len(frame_id_list) >= num_frames:
break
        else:
            raise ValueError(f"Unknown frame sampling strategy: {strategy}")
video_data: torch.Tensor = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
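
The caption cleanup above is easiest to see on a concrete input. The sketch below only calls clean_with_regex as defined in this file; the sample captions are made up for illustration.

# Illustration of clean_with_regex on made-up CogVLM2-style captions
from DPF.filters.videos.cogvlm2_filter import clean_with_regex

print(clean_with_regex("The video shows a cat chasing a red laser dot across a wooden floor."))
# -> "a cat chasing a red laser dot across a wooden floor."

print(clean_with_regex("Throughout the video, the camera slowly pans over a snow-covered-mountain-ridge."))
# -> "the camera slowly pans over a snow covered mountain ridge."
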