diff --git a/lmms_eval/models/model_utils/internvl_utils.py b/lmms_eval/models/model_utils/internvl_utils.py
new file mode 100644
index 000000000..c47b5b8bb
--- /dev/null
+++ b/lmms_eval/models/model_utils/internvl_utils.py
@@ -0,0 +1,133 @@
+import logging
+
+import numpy as np
+import torch
+from PIL import Image
+from decord import VideoReader, cpu
+
+eval_logger = logging.getLogger("eval_logger")
+
+
+def adaptive_keyframe_sampling(video_path: str, num_segments: int, query: str) -> np.ndarray:
+    """
+    Select frame indices relevant to the textual query using CLIP similarity.
+    Falls back to uniform sampling if CLIP loading fails due to PyTorch version issues.
+
+    Parameters
+    ----------
+    video_path : str
+        Path to the video file.
+    num_segments : int
+        Number of key frames to sample.
+    query : str
+        Text query describing the desired content.
+
+    Returns
+    -------
+    np.ndarray
+        Sorted array of selected frame indices.
+    """
+    vr = VideoReader(video_path, ctx=cpu(0))
+    total_frames = len(vr)
+
+    # Fall back to uniform indices when the query is empty
+    if not query or not query.strip():
+        eval_logger.debug("adaptive_keyframe_sampling: Query is empty, using uniform sampling.")
+        return np.linspace(0, total_frames - 1, num_segments, dtype=int)
+
+    try:
+        # Try to load CLIP with safetensors support
+        from transformers import CLIPModel, CLIPProcessor
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Prefer a fast image processor but fall back to the slow version when necessary
+        def _load_processor(model_name: str):
+            from transformers.utils import logging as hf_logging
+
+            hf_logging.set_verbosity_error()
+            try:
+                proc = CLIPProcessor.from_pretrained(model_name, use_fast=True)
+                if not hasattr(proc.image_processor, "_valid_processor_keys"):
+                    raise AttributeError
+            except AttributeError:
+                eval_logger.warning("⚠️ Fast image processor unsupported; using slow version")
+                proc = CLIPProcessor.from_pretrained(model_name, use_fast=False)
+
+            return proc
+
+        # Try multiple approaches to load CLIP safely
+        try:
+            model = CLIPModel.from_pretrained(
+                "openai/clip-vit-base-patch32",
+                use_safetensors=True,
+            ).to(device)
+            processor = _load_processor("openai/clip-vit-base-patch32")
+            load_msg = "✓ CLIP loaded with safetensors"
+        except Exception as e1:
+            try:
+                model = CLIPModel.from_pretrained(
+                    "openai/clip-vit-base-patch16",
+                    use_safetensors=True,
+                ).to(device)
+                processor = _load_processor("openai/clip-vit-base-patch16")
+                load_msg = "✓ CLIP loaded with clip-vit-base-patch16 and safetensors"
+            except Exception as e2:
+                try:
+                    model = CLIPModel.from_pretrained(
+                        "openai/clip-vit-base-patch32",
+                        torch_dtype=torch.float16,
+                        use_safetensors=True,
+                        trust_remote_code=False,
+                    ).to(device)
+                    processor = _load_processor("openai/clip-vit-base-patch32")
+                    load_msg = "✓ CLIP loaded with float16 and safetensors"
+                except Exception as e3:
+                    eval_logger.debug("All CLIP loading attempts failed:")
+                    eval_logger.debug(f"  Attempt 1 (safetensors): {e1}")
+                    eval_logger.debug(f"  Attempt 2 (patch16): {e2}")
+                    eval_logger.debug(f"  Attempt 3 (float16): {e3}")
+                    eval_logger.debug("Falling back to uniform sampling")
+                    return np.linspace(0, total_frames - 1, num_segments, dtype=int)
+        try:
+            text_inputs = processor(
+                text=query,
+                return_tensors="pt",
+                truncation=True,
+                max_length=77,
+                return_overflowing_tokens=True,
+            ).to(device)
+            overflow = text_inputs.pop("overflowing_tokens", None)
+            text_inputs.pop("num_truncated_tokens", None)
+            if overflow is not None and overflow.numel() > 0:
logging.warning("⚠️ query truncated to 77 tokens for CLIP") + with torch.no_grad(): + text_features = model.get_text_features(**text_inputs) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + candidate_indices = np.linspace(0, total_frames - 1, num_segments * 4, dtype=int) + frames = [Image.fromarray(vr[idx].asnumpy()) for idx in candidate_indices] + img_inputs = processor(images=frames, return_tensors="pt").to(device) + with torch.no_grad(): + image_features = model.get_image_features(**img_inputs) + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + + scores = (image_features @ text_features.T).squeeze() + topk = scores.topk(num_segments).indices.cpu().numpy() + selected = np.sort(candidate_indices[topk]) + except Exception as e: + eval_logger.debug(f"Adaptive sampling failed after CLIP load: {e}") + eval_logger.debug("Falling back to uniform sampling") + return np.linspace(0, total_frames - 1, num_segments, dtype=int) + + eval_logger.debug(load_msg) + eval_logger.debug(f"\u2713 Adaptive sampling successful: selected {len(selected)} frames") + return selected + except ImportError as e: + eval_logger.debug(f"CLIP dependencies not available: {e}") + eval_logger.debug("Falling back to uniform sampling") + return np.linspace(0, total_frames - 1, num_segments, dtype=int) + except Exception as e: + eval_logger.debug(f"Adaptive sampling failed with error: {e}") + eval_logger.debug("Falling back to uniform sampling") + return np.linspace(0, total_frames - 1, num_segments, dtype=int) diff --git a/lmms_eval/models/simple/internvl3.py b/lmms_eval/models/simple/internvl3.py new file mode 100644 index 000000000..295f171bb --- /dev/null +++ b/lmms_eval/models/simple/internvl3.py @@ -0,0 +1,646 @@ +import logging +import math +import json +import os +import random +import atexit +from datetime import timedelta +from typing import List, Tuple + +import numpy as np +import torch +import torchvision.transforms as T +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from accelerate.utils import InitProcessGroupKwargs +from decord import VideoReader, cpu +from PIL import Image +from torchvision.transforms.functional import InterpolationMode +from tqdm import tqdm +from transformers import AutoModel, AutoTokenizer + +from ..model_utils.internvl_utils import adaptive_keyframe_sampling + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +eval_logger = logging.getLogger("eval_logger") + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + +# Set global seeds for reproducibility +random.seed(0) +np.random.seed(0) +torch.manual_seed(0) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +DEFAULT_GEN_KWARGS = dict( + num_beams=1, + max_new_tokens=1024, + do_sample=False, + temperature=0.0, + top_p=1.0, + top_k=1, + repetition_penalty=1.0, +) + + +def build_transform(input_size): + """Build a transformation pipeline for image preprocessing. + + Args: + input_size (int): The target size for the input images. + + Returns: + T.Compose: The image transformation pipeline. 
+ """ + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose( + [ + T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD), + ] + ) + return transform + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + """Find the closest aspect ratio from a set of target ratios. + + Args: + aspect_ratio (float): The aspect ratio of the original image. + target_ratios (List[Tuple[int, int]]): A list of target aspect ratios to consider. + width (int): The width of the original image. + height (int): The height of the original image. + image_size (int): The size of the image after resizing. + + Returns: + Tuple[int, int]: The closest aspect ratio as a tuple (width, height). + """ + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): + """Dynamically preprocess the input image by resizing and splitting it into multiple patches. + + Args: + image (PIL.Image): The input image to preprocess. + min_num (int, optional): The minimum number of patches to create. Defaults to 1. + max_num (int, optional): The maximum number of patches to create. Defaults to 12. + image_size (int, optional): The size of the image after resizing. Defaults to 448. + use_thumbnail (bool, optional): Whether to use a thumbnail of the image. Defaults to False. + + Returns: + List[PIL.Image]: A list of preprocessed image patches. + """ + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image(image, input_size=448, max_num=12): + """Load and preprocess the input image. + + Args: + image (PIL.Image): The input image to preprocess. 
+        input_size (int, optional): The size of the image after resizing. Defaults to 448.
+        max_num (int, optional): The maximum number of patches to create. Defaults to 12.
+
+    Returns:
+        torch.Tensor: The preprocessed image tensor.
+    """
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=False, max_num=max_num)
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+
+def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+    """Get the frame indices for each segment.
+
+    Args:
+        bound (tuple): The start and end time bounds for the segment.
+        fps (float): The frames per second of the video.
+        max_frame (int): The index of the last frame in the video.
+        first_idx (int, optional): The index of the first frame to include. Defaults to 0.
+        num_segments (int, optional): The number of segments to divide the video into. Defaults to 32.
+
+    Returns:
+        np.ndarray: The frame indices for each segment.
+    """
+    if bound:
+        start, end = bound[0], bound[1]
+    else:
+        start, end = -100000, 100000
+    start_idx = max(first_idx, round(start * fps))
+    end_idx = min(round(end * fps), max_frame)
+    seg_size = float(end_idx - start_idx) / num_segments
+    frame_indices = np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
+    return frame_indices
+
+
+def load_video(video_path, bound=None, input_size=448, max_num=6, num_segments=32, use_adaptive_sampling=True, query=None):
+    """Load and preprocess the input video, matching the frame-selection logic in internvl_utils.py.
+
+    Args:
+        video_path (str): The path to the video file.
+        bound (tuple, optional): The start and end time bounds for the segment. Defaults to None.
+        input_size (int, optional): The size of the video frames after resizing. Defaults to 448.
+        max_num (int, optional): The maximum number of patches to create per frame. Defaults to 6.
+        num_segments (int, optional): The number of segments to divide the video into. Defaults to 32.
+        use_adaptive_sampling (bool, optional): Whether to use adaptive keyframe sampling. Defaults to True.
+        query (str, optional): The query for adaptive sampling. Defaults to None.
+
+    Returns:
+        tuple: (pixel_values, num_patches_list, frame_times, video_length)
+    """
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1
+    fps = float(vr.get_avg_fps())
+    video_length = len(vr) / fps
+
+    pixel_values_list, num_patches_list, frame_times = [], [], []
+    transform = build_transform(input_size=input_size)
+
+    # Frame selection - match internvl_utils.py logic
+    if use_adaptive_sampling and query:
+        frame_indices = adaptive_keyframe_sampling(video_path, num_segments, query)
+    else:
+        frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
+
+    eval_logger.debug(f"DEBUG: load_video - video_path: {video_path}, total_frames: {len(vr)}, requested_segments: {num_segments}, selected_indices: {len(frame_indices)}, video_length: {video_length:.2f}s")
+
+    # Process the selected frames
+    for frame_index in frame_indices:
+        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)  # use_thumbnail=True, matching the reference pipeline
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
+        num_patches_list.append(pixel_values.shape[0])
+        pixel_values_list.append(pixel_values)
+        frame_times.append(frame_index / fps)  # Match internvl_utils.py naming
+
+    pixel_values = torch.cat(pixel_values_list)
+    return pixel_values, num_patches_list, frame_times, video_length  # Return frame_times instead of image_times
+
+
+def split_model(model_name, num_layers=None):
+    """Split the model across available GPUs for distributed inference.
+
+    Args:
+        model_name (str): The name of the model to split.
+        num_layers (int, optional): The number of layers in the model. Defaults to None.
+
+    Returns:
+        dict: A mapping of layer names to device IDs.
+    """
+    device_map = {}
+    world_size = torch.cuda.device_count()
+    if num_layers is None:
+        # InternVL3-8B uses Qwen2.5-7B, which has 28 layers
+        num_layers = {
+            "InternVL3-1B": 16,  # Based on Qwen2.5-0.5B
+            "InternVL3-2B": 24,  # Based on Qwen2.5-1.5B
+            "InternVL3-8B": 28,  # Based on Qwen2.5-7B
+            "InternVL3-14B": 40,  # Based on Qwen2.5-14B
+            "InternVL3-38B": 64,  # Based on Qwen2.5-32B
+            "InternVL3-78B": 80,  # Based on Qwen2.5-72B
+        }.get(
+            model_name.split("/")[-1],
+            28,  # Default to 28 for InternVL3-8B
+        )
+
+    # Since the first GPU will be used for ViT, treat it as half a GPU.
+    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+    num_layers_per_gpu = [num_layers_per_gpu] * world_size
+    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+    layer_cnt = 0
+    for i, num_layer in enumerate(num_layers_per_gpu):
+        for j in range(num_layer):
+            device_map[f"language_model.model.layers.{layer_cnt}"] = i
+            layer_cnt += 1
+    device_map["vision_model"] = 0
+    device_map["mlp1"] = 0
+    device_map["language_model.model.tok_embeddings"] = 0
+    device_map["language_model.model.embed_tokens"] = 0
+    device_map["language_model.output"] = 0
+    device_map["language_model.model.norm"] = 0
+    device_map["language_model.lm_head"] = 0
+    device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
+
+    return device_map
+
+
+@register_model("internvl3")
+class InternVL3(lmms):
+    """InternVL3-8B model implementation for lmms-eval.
+
+    Args:
+        lmms (lmms): The base lmms class.
+ """ + + def __init__( + self, + pretrained: str = "OpenGVLab/InternVL3-8B", + modality: str = "image", + device: str = "cuda:0", + device_map: str = "cuda:0", + batch_size: str = "1", + num_frame: int = 32, + dynamic_image_size=False, + use_temporal_context: bool = True, # Enable enhanced temporal context by default + use_adaptive_sampling: bool = False, # Disable adaptive sampling by default + num_layers=None, + max_num: int = 6, # Maximum number of image tiles + **kwargs, + ): + super().__init__() + + self.path = pretrained + self.num_frame = num_frame + self.max_num = max_num + self.use_temporal_context = use_temporal_context + self.use_adaptive_sampling = use_adaptive_sampling + + batch_size = int(batch_size) + assert batch_size == 1, f"Batch size should be 1 for InternVL3, but got {batch_size}." + self.batch_size_per_gpu = batch_size + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + self.accelerator = accelerator + self.dynamic_image_size = dynamic_image_size + + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + device_map = split_model(pretrained, num_layers=num_layers) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + # Load the model with trust_remote_code=True for InternVL3 + self._model = AutoModel.from_pretrained( + self.path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + device_map=self.device_map, + ).eval() + + self._tokenizer = AutoTokenizer.from_pretrained( + self.path, + trust_remote_code=True, + use_fast=False, # InternVL3 recommendation + ) + + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [ + DistributedType.FSDP, + DistributedType.MULTI_GPU, + DistributedType.DEEPSPEED, + ], "Unsupported distributed type provided. Only DDP and FSDP are supported." + + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected DistributedType.DEEPSPEED. 
+
+            if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
+                self._model = accelerator.prepare(self.model)
+            else:
+                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism")
+            self._rank = 0
+            self._world_size = 1
+        else:
+            eval_logger.info(f"Using single device: {self._device}")
+            self.model.to(self._device)
+            self._rank = 0
+            self._world_size = 1
+
+        self.modality = modality
+
+        # Initialize debug log file (disabled to avoid unwanted files)
+        self.debug_log_file = None
+
+        # Initialize cache file tracking for cleanup
+        self._cache_file_path = "mammalps_enhanced_prompts.json"
+
+        # Register cleanup function to run at exit
+        atexit.register(self._cleanup_cache_file)
+
+    def _cleanup_cache_file(self):
+        """Helper method to clean up the cache file."""
+        try:
+            import os
+
+            if hasattr(self, "_cache_file_path") and os.path.exists(self._cache_file_path):
+                os.remove(self._cache_file_path)
+                eval_logger.debug(f"Cleaned up cache file: {self._cache_file_path}")
+        except Exception as e:
+            eval_logger.debug(f"Could not clean up cache file: {e}")
+
+    def __del__(self):
+        """Cleanup cache file when model instance is destroyed."""
+        self._cleanup_cache_file()
+
+    @property
+    def config(self):
+        return self._model.config
+
+    @property
+    def tokenizer(self):
+        return self._tokenizer
+
+    @property
+    def model(self):
+        if hasattr(self, "accelerator"):
+            return self.accelerator.unwrap_model(self._model)
+        else:
+            return self._model
+
+    @property
+    def batch_size(self):
+        return self.batch_size_per_gpu
+
+    @property
+    def device(self):
+        return self._device
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
+    def flatten(self, input):
+        new_list = []
+        for i in input:
+            for j in i:
+                new_list.append(j)
+        return new_list
+
+    def generate_until(self, requests) -> List[str]:
+        """Generate responses until a certain condition is met.
+
+        Args:
+            requests (List[Request]): The list of requests to process.
+
+        Raises:
+            ValueError: If the requests list is empty or invalid.
+
+        Returns:
+            List[str]: The generated responses.
+ """ + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="InternVL3 Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + if "until" in gen_kwargs: + gen_kwargs.pop("until") + for k, v in DEFAULT_GEN_KWARGS.items(): + if k not in gen_kwargs: + gen_kwargs[k] = v + + pop_keys = [] + for k, v in gen_kwargs.items(): + if k not in DEFAULT_GEN_KWARGS: + pop_keys.append(k) + + for k in pop_keys: + gen_kwargs.pop(k) + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + + if self.modality == "image": + if visuals: + visuals = [load_image(visual, max_num=self.max_num).to(torch.bfloat16).cuda() for visual in visuals] + pixel_values = torch.cat(visuals, dim=0) + num_patches_list = [visual.size(0) for visual in visuals] + image_tokens = [""] * len(visuals) + image_tokens = " ".join(image_tokens) + contexts = image_tokens + "\n" + contexts + else: + pixel_values = None + num_patches_list = None + + response, history = self.model.chat( + self.tokenizer, + pixel_values, + contexts, + gen_kwargs, + num_patches_list=num_patches_list, + history=None, + return_history=True, + ) + + elif self.modality == "video": + assert len(visuals) == 1, f"Only one video is supported, but got {len(visuals)} videos." + video_path = visuals[0] + pixel_values, num_patches_list, frame_times, video_length = load_video( + video_path, + bound=None, + input_size=448, + max_num=self.max_num, # Use the configured max_num + num_segments=self.num_frame, + use_adaptive_sampling=self.use_adaptive_sampling, # Use the configured adaptive sampling setting + query=contexts, # Pass the query for potential adaptive sampling + ) + pixel_values = pixel_values.to(torch.bfloat16).cuda() + + # DEBUG: Print actual frame count and temporal info + eval_logger.debug(f"DEBUG: Video processing - requested frames: {self.num_frame}, actual frames: {len(num_patches_list)}, patches per frame: {num_patches_list}") + eval_logger.debug(f"DEBUG: Video temporal info - video_length: {video_length:.2f}s, frame_timestamps: {[f'{t:.2f}s' for t in frame_times[:5]]}{'...' if len(frame_times) > 5 else ''}") + + # Create enhanced video prefix with temporal information + if hasattr(self, "use_temporal_context") and self.use_temporal_context: + # Match exact format from animal_dataset.py + special_tokens = "\n".join(["Frame-{} at second {:.2f}: ".format(i + 1, frame_times[i]) for i in range(len(num_patches_list))]) + special_tokens = "The video is {:.2f} second(s) long and you can see the frames below:\n".format(video_length) + special_tokens + # Replace