configs/config_all.yaml (49 additions, 0 deletions)
@@ -513,7 +513,56 @@ process:
mem_required: '9GB'
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.


# When using the HumanVBench mappers, keep_stats_in_res_ds should be set to true (see the driver sketch after this hunk)

- video_human_tracks_extraction_mapper: # Get the body and face trajectory bounding boxes of the people in one shot of the video. To ensure correctness, it should be applied after video_split_by_scene_mapper
face_track_bbox_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt # The storage location of the bounding box tracks of the characters in the video
Collaborator: Please make all ID-related (daoyuan, zt, ...) paths/info dummy placeholders, e.g., /your_path/to/xx

YOLOv8_human_model_path: ./thirdparty/humanvbench_models/YOLOv8_human/weights/best.pt
mem_required: '10GB'

- video_humantrack_face_demographic_mapper: # Get the facial demographics of each person based on the results of video_human_tracks_extraction_mapper
original_data_save_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # The location where the specific results of each frame's detection are stored
detect_interval: 5

- video_audio_attribute_mapper: # If the audio is speech, classify the speaker's gender and age
hf_audio_mapper: '/mnt/daoyuan_open_research/zt_data/pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification
mem_required: '7GB'

- video_captioning_from_human_tracks_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on the single person in the video for captioning
video_describe_model_path: /mnt/daoyuan_open_research/zt_data/pt_model/videollm/VideoLLaMA3-7B # model path to sharegpt4video-8b
Collaborator: Inconsistent comment: sharegpt4video-8b?

trust_remote_code: true
tempt_video_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos that will be removed finally.
Collaborator: 'tempt' seems strange; perhaps temp_video_path?

mem_required: '40GB'
Collaborator: This value indicates the per-model memory requirement (not the whole recipe's), so a 7B model usually needs much less than 40GB.


- video_captioning_face_attribute_emotion_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on judging the gender, age, and race of a single person in the video
face_track_query: Please only describe the appearance and facial emotions of the person in the video in detail. Don't mention the background. Less than 80 words.
trust_remote_code: true
cropping_face_video_tempt_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos
video_describe_model_path: /mnt/daoyuan_open_research/zt_data/pt_model/videollm/VideoLLaMA3-7B # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F
Collaborator: The comment, path, and mem_required fields have the same issues as the video_captioning_from_human_tracks_mapper OP above. Please check the other OPs as well.

mem_required: '40GB'

- video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
tempt_save_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos
Light_ASD_model_path: /home/daoyuan_mm/data-juicer/thirdparty/humanvbench_models/Light-ASD/weight/finetuning_TalkSet.model
acitve_threshold: 15
mem_required: '10GB'


- video_audio_speech_ASR_mapper: # Automatic speech recognition from video speech
model_dir_ASR: '/mnt/daoyuan_open_research/zt_data/pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'

- video_audio_speech_emotion_mapper: # Speech emotion recognition from video speech
model_dir_emo: '/mnt/daoyuan_open_research/zt_data/pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'

# Filter ops
- video_face_ratio_filter: # Filter to retain human-centric videos
threshold: 0.65 # The lower limit of the ratio of frames with faces to the total number of video frames
detect_interval: 4
any_or_all: any

- alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
tokenization: false # whether to count the ratio of alphanumeric to the total number of tokens.
min_ratio: 0.0 # the min ratio of filter range
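
For orientation, here is a minimal driver sketch showing how this recipe slice might be run with stats kept in the result dataset, which the HumanVBench mappers above rely on. This is a sketch only, not part of the PR: it assumes data-juicer's standard init_configs/Executor entry points, and the export path is a dummy placeholder.

# Hypothetical driver for the recipe above (illustrative sketch).
from data_juicer.config import init_configs
from data_juicer.core import Executor

cfg = init_configs(args=[
    '--config', 'configs/config_all.yaml',           # the recipe edited in this hunk
    '--keep_stats_in_res_ds', 'true',                # required by the HumanVBench mappers
    '--export_path', '/your_path/to/result.jsonl',   # dummy placeholder output path
])
Executor(cfg).run()
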
data_juicer/ops/filter/__init__.py (2 additions, 1 deletion)
@@ -43,6 +43,7 @@
from .video_watermark_filter import VideoWatermarkFilter
from .word_repetition_filter import WordRepetitionFilter
from .words_num_filter import WordsNumFilter
from .video_face_ratio_filter import VideoFaceRatioFilter

__all__ = [
'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter',
@@ -61,7 +62,7 @@
'VideoMotionScoreFilter', 'VideoMotionScoreRaftFilter', 'VideoNSFWFilter',
'VideoOcrAreaRatioFilter', 'VideoResolutionFilter',
'VideoTaggingFromFramesFilter', 'VideoWatermarkFilter',
'WordRepetitionFilter', 'WordsNumFilter'
'WordRepetitionFilter', 'WordsNumFilter', 'VideoFaceRatioFilter'
]

NON_STATS_FILTERS = [
data_juicer/ops/filter/video_face_ratio_filter.py (139 additions, 0 deletions)
@@ -0,0 +1,139 @@
import gc
import os

import av
import dlib
import numpy as np
import psutil
from jsonargparse.typing import ClosedUnitInterval

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import pil_to_opencv

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_VIDEOS

OP_NAME = 'video_face_ratio_filter'

@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoFaceRatioFilter(Filter):
"""Keep data samples whose videos' durations are within a specified range.
"""

def __init__(self,
threshold: ClosedUnitInterval = 0.8,
detect_interval: int = 1,
any_or_all: str = 'all',
*args,
**kwargs):
"""
Initialization method.

:param any_or_all: keep this sample with 'any' or 'all' strategy of
all videos. 'any': keep this sample if any videos meet the
condition. 'all': keep this sample only if all videos meet the
condition.
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.threshold = threshold

if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')

        # initialize the dlib frontal face detector
        self.detector = dlib.get_frontal_face_detector()

        self.detect_interval = detect_interval

def compute_stats_single(self, sample, rank=None, context=False):
# check if it's computed already
if StatsKeys.video_face_exist in sample[Fields.stats]:
return sample

# load videos
loaded_video_keys = sample[self.video_key]
video_faces_ratio = {}

# face_detect_S3FD = get_model(self.detector_key, rank=rank)

process = psutil.Process(os.getpid())
# memory_before = process.memory_info().rss / 1024 ** 2 # MB


for video_key in loaded_video_keys:
try:
with av.open(video_key) as container:
# getting video stream
video_stream = next(s for s in container.streams if s.type == 'video')
# iterate over the video frame and detect faces
frame_counter = 0
total_frames = 0
frames_with_face = 0
detect_num = 0
for packet in container.demux(video_stream):
try:
for frame in packet.decode():
total_frames += 1
frame_counter += 1

if frame_counter % self.detect_interval == 0:
detect_num = detect_num + 1
img = frame.to_image()
image = pil_to_opencv(img)
# imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# faces = face_detect_S3FD.detect_faces(imageNumpy, conf_th=0.9, scales=[0.25])
faces = self.detector(image)
if len(faces) > 0:
frames_with_face += 1
except Exception as e:
print(f"Frame decoding error in video {video_key}: {e}")
frames_with_face = 0
detect_num = 0

# calculate the proportion of the number of face frames
if detect_num > 0:
face_ratio = frames_with_face / detect_num
else:
face_ratio = 0.0
video_faces_ratio[video_key] = face_ratio
            except av.AVError as e:
                # the `with` block already closes the container on exit, so
                # just record a zero face ratio when the video cannot be opened
                print(f"Error opening video {video_key}: {e}")
                video_faces_ratio[video_key] = 0.0

# get video faces ratio
sample[Fields.stats][StatsKeys.video_face_exist] = [
video_faces_ratio[video_key] for video_key in sample[self.video_key]
]

memory_after = process.memory_info().rss / 1024 ** 2 # MB
print(f"Memory Usage: {memory_after:.2f} MB")

gc.collect()

return sample

def process_single(self, sample):
video_faces_ratio = sample[Fields.stats][StatsKeys.video_face_exist]
        keep_bools = np.array([
            face_ratio >= self.threshold
            for face_ratio in video_faces_ratio
        ])
if len(keep_bools) <= 0:
return True

# different strategies
if self.any:
return keep_bools.any()
else:
return keep_bools.all()
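
For reference, a minimal usage sketch of the new filter outside a recipe (not part of this PR). The video path is a placeholder, the default video_key of 'videos' is assumed, and since compute_stats_single is called directly here the stats field is initialized by hand, which the Filter base class would normally take care of.

from data_juicer.ops.filter.video_face_ratio_filter import VideoFaceRatioFilter
from data_juicer.utils.constant import Fields

# keep a sample if any of its videos shows a face in at least 65% of the
# frames sampled every 4th frame (matching the recipe values above)
op = VideoFaceRatioFilter(threshold=0.65, detect_interval=4, any_or_all='any')
sample = {'videos': ['/your_path/to/video.mp4'], Fields.stats: {}}

sample = op.compute_stats_single(sample)
print(sample[Fields.stats])       # per-video face ratios under video_face_exist
print(op.process_single(sample))  # True if the sample should be kept
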
data_juicer/ops/mapper/__init__.py (14 additions, 1 deletion)
@@ -73,6 +73,15 @@
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper
from .video_active_speaker_mapper import VideoActiveSpeakerMapper
from .video_audio_attribute_mapper import VideoAudioAttributeMapper
from .video_audio_speech_ASR_mapper import VideoAudioSpeechASRMapper
from .video_audio_speech_emotion_mapper import VideoAudioSpeechEmotionMapper
from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
from .video_captioning_from_human_tracks_mapper import VideoCaptioningFromHumanTracksMapper
from .video_human_tracks_extraction_mapper import VideoHumanTracksExtractionMapper
from .video_humantrack_face_demographic_mapper import VideoHumantrackFaceDemographicMapper
Collaborator: Comparing video_human_tracks_extraction_mapper and video_humantrack_face_demographic_mapper, how about using consistent naming, either human_tracks or humantrack?


__all__ = [
'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
@@ -105,5 +114,9 @@
'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
'WhitespaceNormalizationMapper'
    'WhitespaceNormalizationMapper', 'VideoActiveSpeakerMapper',
    'VideoAudioAttributeMapper', 'VideoAudioSpeechASRMapper',
    'VideoAudioSpeechEmotionMapper', 'VideoCaptioningFaceAttributeEmotionMapper',
    'VideoCaptioningFromHumanTracksMapper', 'VideoHumanTracksExtractionMapper',
    'VideoHumantrackFaceDemographicMapper'
]