From 139a2b4c17f051226a856ab2001957dbc5d8849f Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Sat, 5 Jul 2025 20:59:31 +0000
Subject: [PATCH 01/12] Video loader with caching and download

---
 lmms_eval/tasks/lemonade/lemonade.yaml |  23 ++
 lmms_eval/tasks/lemonade/utils.py      | 282 +++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 lmms_eval/tasks/lemonade/lemonade.yaml
 create mode 100644 lmms_eval/tasks/lemonade/utils.py

diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml
new file mode 100644
index 000000000..22b81d02d
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/lemonade.yaml
@@ -0,0 +1,23 @@
+dataset_path: amathislab/LEMONADE
+task: "lemonade"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.lemonade_doc_to_visual
+doc_to_text: !function utils.lemonade_doc_to_text
+doc_to_target: "Correct Answer"
+
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  do_sample: false
+
+process_results: !function utils.lemonade_process_results
+metric_list:
+  - metric: acc
+    aggregation: !function utils.lemonade_aggregate_results
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Answer the following multiple-choice question using the given images.\n"
+    post_prompt: "\nRespond only with the letter of the correct answer."
\ No newline at end of file
diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
new file mode 100644
index 000000000..7466c1a1f
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -0,0 +1,282 @@
+import os
+from datasets import load_dataset
+from datetime import datetime
+import cv2
+from collections import defaultdict
+from PIL import Image
+import numpy as np
+import requests
+import zipfile
+from tqdm import tqdm
+import lmms_eval.tasks._task_utils.file_utils as file_utils
+from lmms_eval.utils import load_yaml_config
+from huggingface_hub import hf_hub_download
+
+MAX_NUM_FRAMES = 8
+
+LEMONADE_ZIP_NAMES = [
+    "videos_batch_0.zip",
+    "videos_batch_1.zip",
+    "videos_batch_2.zip",
+    "videos_batch_3.zip",
+    "videos_batch_4.zip",
+]
+
+data_dir = "./data/lemonade"
+
+def download_and_extract_lemonade_videos(data_dir):
+    os.makedirs(data_dir, exist_ok=True)
+    videos_dir = os.path.join(data_dir, "videos")
+    os.makedirs(videos_dir, exist_ok=True)
+
+    for zip_name in LEMONADE_ZIP_NAMES:
+        print(f"Downloading {zip_name} from Hugging Face...")
+        zip_path = hf_hub_download(
+            repo_id="amathislab/LEMONADE",
+            filename=zip_name,
+            repo_type="dataset",
+            cache_dir=os.path.join(data_dir, "cache") 
+        )
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(videos_dir)
+    
+    print("All videos downloaded and extracted successfully.\n")
+
+def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES):
+
+    cap = cv2.VideoCapture(video_file)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    start_frame = max(0, start_frame)
+    end_frame = min(end_frame, total_frames - 1)
+    total_valid_frames = end_frame - start_frame + 1
+    num_frames = min(max_num_frames, total_valid_frames)
+
+    step = total_valid_frames / num_frames
+    frame_indices = [int(start_frame + i * step) for i in range(num_frames)]
+
+    frames = []
+    for target_idx in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+        success, frame = cap.read()
+        if not success:
+            continue
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(frame_rgb).convert("RGB")
+        frames.append(pil_img)
+
+    cap.release()
+    return frames
+
+
+def parse_options(options):
+    option_letters = [chr(ord("A") + i) for i in range(len(options))]
+
+    if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)):
+        return "\n".join(options)
+
+    choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)])
+    return choices_str
+
+
+def lemonade_doc_to_visual(doc):
+    videos_dir = os.path.join(data_dir, "videos")
+
+    if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0:
+        print("Videos directory is empty — downloading and extracting...\n")
+        download_and_extract_lemonade_videos(data_dir)
+
+    video_filename = doc["Clip"] + "_hololens.mp4"
+
+    video_path = os.path.join(
+        videos_dir,
+        video_filename
+    )
+
+    if os.path.exists(video_path):
+        start = int(doc["Start"])
+        end = int(doc["End"])
+        frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES)
+    else:
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    return frames
+    
+
+def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+        
+    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+    
+    question = "Question: " + doc["Question"]
+    parsed_options = parse_options(eval(doc["Answers"]))
+    choices = "Choices:\n" + parsed_options
+
+    return f"{pre_prompt}{question}\n{choices}{post_prompt}"
+
+
+def get_multi_choice_info(options):
+    """
+    Given the list of options for multiple choice question
+    Return the index2ans and all_choices
+    """
+    assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}"
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    if response == "API Error":
+        return "API Error"
+
+    if response == "":
+        return "Empty Response"
+
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " " 
+
+    index_ans = True
+    ans_with_brack = False
+    ans_with_period = False
+    ans_with_colon = False
+    candidates = []
+
+    for choice in all_choices:
+        if f"{choice}." in response:
+            candidates.append(choice)
+            ans_with_period = True
+    for choice in all_choices: 
+        if f"{choice}:" in response:
+            candidates.append(choice)
+            ans_with_colon = True
+    if len(candidates) == 0:
+        for choice in all_choices:
+            if f"({choice})" in response:
+                candidates.append(choice)
+                ans_with_brack = True
+    if len(candidates) == 0:
+        for choice in all_choices: 
+            if f"{choice} " in response:
+                candidates.append(choice)
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False 
+    if len(candidates) == 0:
+        pred_index = "A"
+
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_period:
+                for can in candidates:
+                    index = response.rfind(f"{can}.")
+                    start_indexes.append(index)
+            elif ans_with_colon:
+                for can in candidates:
+                    index = response.rfind(f"{can}:")
+                    start_indexes.append(index)
+            elif ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+def lemonade_process_results(doc, results):
+    pred = results[0]
+
+    index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"]))
+    parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
+
+    acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred}
+    return {"acc": acc}
+
+
+def lemonade_aggregate_results(results):
+    def compute_accuracy(grouped_results):
+        acc_dict = {}
+        for key, samples in grouped_results.items():
+            correct = sum([r["parsed_pred"] == r["answer"] for r in samples])
+            total = len(samples)
+            acc = round(correct / total, 5) if total > 0 else 0.0
+            stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0
+            acc_dict[key] = {
+                "num": total,
+                "acc": acc,
+                "acc_stderr": stderr,
+            }
+        return acc_dict
+
+    qid_results = defaultdict(list)
+    category_results = defaultdict(list)
+    subcategory_results = defaultdict(list)
+    difficulty_results = defaultdict(list)
+
+    valid_results = [r for r in results if r["parsed_pred"] != "API Error"]
+
+    for r in valid_results:
+        qid_results[r["QID"]].append(r)
+        category_results[r["category"]].append(r)
+        subcategory_results[r["subcategory"]].append(r)
+        difficulty_results[r["difficulty"]].append(r)
+
+    qid_acc = compute_accuracy(qid_results)
+    category_acc = compute_accuracy(category_results)
+    subcategory_acc = compute_accuracy(subcategory_results)
+    difficulty_acc = compute_accuracy(difficulty_results)
+
+    total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results])
+    total = len(valid_results)
+    overall_acc = round(total_correct / total, 5) if total > 0 else 0.0
+    overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0
+
+    print("\nResults:")
+
+    print("\nAccuracy per QID:")
+    for k, v in qid_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Category:")
+    for k, v in category_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Subcategory:")
+    for k, v in subcategory_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Difficulty:")
+    for k, v in difficulty_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)")
+        
+    return overall_acc
\ No newline at end of file

From fe24c04dd351139fbce20c5a3402d5c9da857717 Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Sat, 5 Jul 2025 20:59:31 +0000
Subject: [PATCH 02/12] Video loader with caching and download

---
 lmms_eval/tasks/lemonade/lemonade.yaml |  23 ++
 lmms_eval/tasks/lemonade/utils.py      | 282 +++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 lmms_eval/tasks/lemonade/lemonade.yaml
 create mode 100644 lmms_eval/tasks/lemonade/utils.py

diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml
new file mode 100644
index 000000000..22b81d02d
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/lemonade.yaml
@@ -0,0 +1,23 @@
+dataset_path: amathislab/LEMONADE
+task: "lemonade"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.lemonade_doc_to_visual
+doc_to_text: !function utils.lemonade_doc_to_text
+doc_to_target: "Correct Answer"
+
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  do_sample: false
+
+process_results: !function utils.lemonade_process_results
+metric_list:
+  - metric: acc
+    aggregation: !function utils.lemonade_aggregate_results
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Answer the following multiple-choice question using the given images.\n"
+    post_prompt: "\nRespond only with the letter of the correct answer."
\ No newline at end of file
diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
new file mode 100644
index 000000000..7466c1a1f
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -0,0 +1,282 @@
+import os
+from datasets import load_dataset
+from datetime import datetime
+import cv2
+from collections import defaultdict
+from PIL import Image
+import numpy as np
+import requests
+import zipfile
+from tqdm import tqdm
+import lmms_eval.tasks._task_utils.file_utils as file_utils
+from lmms_eval.utils import load_yaml_config
+from huggingface_hub import hf_hub_download
+
+MAX_NUM_FRAMES = 8
+
+LEMONADE_ZIP_NAMES = [
+    "videos_batch_0.zip",
+    "videos_batch_1.zip",
+    "videos_batch_2.zip",
+    "videos_batch_3.zip",
+    "videos_batch_4.zip",
+]
+
+data_dir = "./data/lemonade"
+
+def download_and_extract_lemonade_videos(data_dir):
+    os.makedirs(data_dir, exist_ok=True)
+    videos_dir = os.path.join(data_dir, "videos")
+    os.makedirs(videos_dir, exist_ok=True)
+
+    for zip_name in LEMONADE_ZIP_NAMES:
+        print(f"Downloading {zip_name} from Hugging Face...")
+        zip_path = hf_hub_download(
+            repo_id="amathislab/LEMONADE",
+            filename=zip_name,
+            repo_type="dataset",
+            cache_dir=os.path.join(data_dir, "cache") 
+        )
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(videos_dir)
+    
+    print("All videos downloaded and extracted successfully.\n")
+
+def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES):
+
+    cap = cv2.VideoCapture(video_file)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    start_frame = max(0, start_frame)
+    end_frame = min(end_frame, total_frames - 1)
+    total_valid_frames = end_frame - start_frame + 1
+    num_frames = min(max_num_frames, total_valid_frames)
+
+    step = total_valid_frames / num_frames
+    frame_indices = [int(start_frame + i * step) for i in range(num_frames)]
+
+    frames = []
+    for target_idx in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+        success, frame = cap.read()
+        if not success:
+            continue
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(frame_rgb).convert("RGB")
+        frames.append(pil_img)
+
+    cap.release()
+    return frames
+
+
+def parse_options(options):
+    option_letters = [chr(ord("A") + i) for i in range(len(options))]
+
+    if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)):
+        return "\n".join(options)
+
+    choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)])
+    return choices_str
+
+
+def lemonade_doc_to_visual(doc):
+    videos_dir = os.path.join(data_dir, "videos")
+
+    if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0:
+        print("Videos directory is empty — downloading and extracting...\n")
+        download_and_extract_lemonade_videos(data_dir)
+
+    video_filename = doc["Clip"] + "_hololens.mp4"
+
+    video_path = os.path.join(
+        videos_dir,
+        video_filename
+    )
+
+    if os.path.exists(video_path):
+        start = int(doc["Start"])
+        end = int(doc["End"])
+        frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES)
+    else:
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    return frames
+    
+
+def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+        
+    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+    
+    question = "Question: " + doc["Question"]
+    parsed_options = parse_options(eval(doc["Answers"]))
+    choices = "Choices:\n" + parsed_options
+
+    return f"{pre_prompt}{question}\n{choices}{post_prompt}"
+
+
+def get_multi_choice_info(options):
+    """
+    Given the list of options for multiple choice question
+    Return the index2ans and all_choices
+    """
+    assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}"
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    if response == "API Error":
+        return "API Error"
+
+    if response == "":
+        return "Empty Response"
+
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " " 
+
+    index_ans = True
+    ans_with_brack = False
+    ans_with_period = False
+    ans_with_colon = False
+    candidates = []
+
+    for choice in all_choices:
+        if f"{choice}." in response:
+            candidates.append(choice)
+            ans_with_period = True
+    for choice in all_choices: 
+        if f"{choice}:" in response:
+            candidates.append(choice)
+            ans_with_colon = True
+    if len(candidates) == 0:
+        for choice in all_choices:
+            if f"({choice})" in response:
+                candidates.append(choice)
+                ans_with_brack = True
+    if len(candidates) == 0:
+        for choice in all_choices: 
+            if f"{choice} " in response:
+                candidates.append(choice)
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False 
+    if len(candidates) == 0:
+        pred_index = "A"
+
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_period:
+                for can in candidates:
+                    index = response.rfind(f"{can}.")
+                    start_indexes.append(index)
+            elif ans_with_colon:
+                for can in candidates:
+                    index = response.rfind(f"{can}:")
+                    start_indexes.append(index)
+            elif ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+def lemonade_process_results(doc, results):
+    pred = results[0]
+
+    index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"]))
+    parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
+
+    acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred}
+    return {"acc": acc}
+
+
+def lemonade_aggregate_results(results):
+    def compute_accuracy(grouped_results):
+        acc_dict = {}
+        for key, samples in grouped_results.items():
+            correct = sum([r["parsed_pred"] == r["answer"] for r in samples])
+            total = len(samples)
+            acc = round(correct / total, 5) if total > 0 else 0.0
+            stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0
+            acc_dict[key] = {
+                "num": total,
+                "acc": acc,
+                "acc_stderr": stderr,
+            }
+        return acc_dict
+
+    qid_results = defaultdict(list)
+    category_results = defaultdict(list)
+    subcategory_results = defaultdict(list)
+    difficulty_results = defaultdict(list)
+
+    valid_results = [r for r in results if r["parsed_pred"] != "API Error"]
+
+    for r in valid_results:
+        qid_results[r["QID"]].append(r)
+        category_results[r["category"]].append(r)
+        subcategory_results[r["subcategory"]].append(r)
+        difficulty_results[r["difficulty"]].append(r)
+
+    qid_acc = compute_accuracy(qid_results)
+    category_acc = compute_accuracy(category_results)
+    subcategory_acc = compute_accuracy(subcategory_results)
+    difficulty_acc = compute_accuracy(difficulty_results)
+
+    total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results])
+    total = len(valid_results)
+    overall_acc = round(total_correct / total, 5) if total > 0 else 0.0
+    overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0
+
+    print("\nResults:")
+
+    print("\nAccuracy per QID:")
+    for k, v in qid_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Category:")
+    for k, v in category_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Subcategory:")
+    for k, v in subcategory_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Difficulty:")
+    for k, v in difficulty_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)")
+        
+    return overall_acc
\ No newline at end of file

From 99af1df198812e22721e5f434e66d80463dcc8a4 Mon Sep 17 00:00:00 2001
From: andy <andy.bonnetto@epfl.ch>
Date: Mon, 18 Aug 2025 16:46:50 +0200
Subject: [PATCH 03/12] black and isort formatting

---
 lmms_eval/tasks/lemonade/utils.py | 53 ++++++++++++++-----------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index 7466c1a1f..c3b2f2c83 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -1,16 +1,17 @@
 import os
-from datasets import load_dataset
+import zipfile
+from collections import defaultdict
 from datetime import datetime
+
 import cv2
-from collections import defaultdict
-from PIL import Image
+import lmms_eval.tasks._task_utils.file_utils as file_utils
 import numpy as np
 import requests
-import zipfile
-from tqdm import tqdm
-import lmms_eval.tasks._task_utils.file_utils as file_utils
-from lmms_eval.utils import load_yaml_config
+from datasets import load_dataset
 from huggingface_hub import hf_hub_download
+from lmms_eval.utils import load_yaml_config
+from PIL import Image
+from tqdm import tqdm
 
 MAX_NUM_FRAMES = 8
 
@@ -24,6 +25,7 @@
 
 data_dir = "./data/lemonade"
 
+
 def download_and_extract_lemonade_videos(data_dir):
     os.makedirs(data_dir, exist_ok=True)
     videos_dir = os.path.join(data_dir, "videos")
@@ -31,18 +33,14 @@ def download_and_extract_lemonade_videos(data_dir):
 
     for zip_name in LEMONADE_ZIP_NAMES:
         print(f"Downloading {zip_name} from Hugging Face...")
-        zip_path = hf_hub_download(
-            repo_id="amathislab/LEMONADE",
-            filename=zip_name,
-            repo_type="dataset",
-            cache_dir=os.path.join(data_dir, "cache") 
-        )
-
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_path = hf_hub_download(repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", cache_dir=os.path.join(data_dir, "cache"))
+
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
             zip_ref.extractall(videos_dir)
-    
+
     print("All videos downloaded and extracted successfully.\n")
 
+
 def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES):
 
     cap = cv2.VideoCapture(video_file)
@@ -89,10 +87,7 @@ def lemonade_doc_to_visual(doc):
 
     video_filename = doc["Clip"] + "_hololens.mp4"
 
-    video_path = os.path.join(
-        videos_dir,
-        video_filename
-    )
+    video_path = os.path.join(videos_dir, video_filename)
 
     if os.path.exists(video_path):
         start = int(doc["Start"])
@@ -102,15 +97,15 @@ def lemonade_doc_to_visual(doc):
         raise FileNotFoundError(f"Video file not found: {video_path}")
 
     return frames
-    
+
 
 def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
-        
+
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     question = "Question: " + doc["Question"]
     parsed_options = parse_options(eval(doc["Answers"]))
     choices = "Choices:\n" + parsed_options
@@ -148,7 +143,7 @@ def parse_multi_choice_response(response, all_choices, index2ans):
 
     for char in [",", ".", "!", "?", ";", ":", "'"]:
         response = response.strip(char)
-    response = " " + response + " " 
+    response = " " + response + " "
 
     index_ans = True
     ans_with_brack = False
@@ -160,7 +155,7 @@ def parse_multi_choice_response(response, all_choices, index2ans):
         if f"{choice}." in response:
             candidates.append(choice)
             ans_with_period = True
-    for choice in all_choices: 
+    for choice in all_choices:
         if f"{choice}:" in response:
             candidates.append(choice)
             ans_with_colon = True
@@ -170,14 +165,14 @@ def parse_multi_choice_response(response, all_choices, index2ans):
                 candidates.append(choice)
                 ans_with_brack = True
     if len(candidates) == 0:
-        for choice in all_choices: 
+        for choice in all_choices:
             if f"{choice} " in response:
                 candidates.append(choice)
     if len(candidates) == 0 and len(response.split()) > 5:
         for index, ans in index2ans.items():
             if ans.lower() in response.lower():
                 candidates.append(index)
-                index_ans = False 
+                index_ans = False
     if len(candidates) == 0:
         pred_index = "A"
 
@@ -278,5 +273,5 @@ def compute_accuracy(grouped_results):
         print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
 
     print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)")
-        
-    return overall_acc
\ No newline at end of file
+
+    return overall_acc

From 34d435fa0f7f7bcd1aae3b1e1db82bc75ec38595 Mon Sep 17 00:00:00 2001
From: andy <andy.bonnetto@epfl.ch>
Date: Mon, 18 Aug 2025 17:01:32 +0200
Subject: [PATCH 04/12] clean imports

---
 lmms_eval/tasks/lemonade/utils.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index c3b2f2c83..bb51677b2 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -1,17 +1,11 @@
 import os
 import zipfile
 from collections import defaultdict
-from datetime import datetime
 
 import cv2
-import lmms_eval.tasks._task_utils.file_utils as file_utils
 import numpy as np
-import requests
-from datasets import load_dataset
 from huggingface_hub import hf_hub_download
-from lmms_eval.utils import load_yaml_config
 from PIL import Image
-from tqdm import tqdm
 
 MAX_NUM_FRAMES = 8
 

From a9caecd79fa6e353f4760d777c3cb6cbfdadd94d Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Sat, 5 Jul 2025 20:59:31 +0000
Subject: [PATCH 05/12] Video loader with caching and download

---
 lmms_eval/tasks/lemonade/lemonade.yaml |  23 ++
 lmms_eval/tasks/lemonade/utils.py      | 282 +++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 lmms_eval/tasks/lemonade/lemonade.yaml
 create mode 100644 lmms_eval/tasks/lemonade/utils.py

diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml
new file mode 100644
index 000000000..22b81d02d
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/lemonade.yaml
@@ -0,0 +1,23 @@
+dataset_path: amathislab/LEMONADE
+task: "lemonade"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.lemonade_doc_to_visual
+doc_to_text: !function utils.lemonade_doc_to_text
+doc_to_target: "Correct Answer"
+
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  do_sample: false
+
+process_results: !function utils.lemonade_process_results
+metric_list:
+  - metric: acc
+    aggregation: !function utils.lemonade_aggregate_results
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Answer the following multiple-choice question using the given images.\n"
+    post_prompt: "\nRespond only with the letter of the correct answer."
\ No newline at end of file
diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
new file mode 100644
index 000000000..7466c1a1f
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -0,0 +1,282 @@
+import os
+from datasets import load_dataset
+from datetime import datetime
+import cv2
+from collections import defaultdict
+from PIL import Image
+import numpy as np
+import requests
+import zipfile
+from tqdm import tqdm
+import lmms_eval.tasks._task_utils.file_utils as file_utils
+from lmms_eval.utils import load_yaml_config
+from huggingface_hub import hf_hub_download
+
+MAX_NUM_FRAMES = 8
+
+LEMONADE_ZIP_NAMES = [
+    "videos_batch_0.zip",
+    "videos_batch_1.zip",
+    "videos_batch_2.zip",
+    "videos_batch_3.zip",
+    "videos_batch_4.zip",
+]
+
+data_dir = "./data/lemonade"
+
+def download_and_extract_lemonade_videos(data_dir):
+    os.makedirs(data_dir, exist_ok=True)
+    videos_dir = os.path.join(data_dir, "videos")
+    os.makedirs(videos_dir, exist_ok=True)
+
+    for zip_name in LEMONADE_ZIP_NAMES:
+        print(f"Downloading {zip_name} from Hugging Face...")
+        zip_path = hf_hub_download(
+            repo_id="amathislab/LEMONADE",
+            filename=zip_name,
+            repo_type="dataset",
+            cache_dir=os.path.join(data_dir, "cache") 
+        )
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(videos_dir)
+    
+    print("All videos downloaded and extracted successfully.\n")
+
+def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES):
+
+    cap = cv2.VideoCapture(video_file)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    start_frame = max(0, start_frame)
+    end_frame = min(end_frame, total_frames - 1)
+    total_valid_frames = end_frame - start_frame + 1
+    num_frames = min(max_num_frames, total_valid_frames)
+
+    step = total_valid_frames / num_frames
+    frame_indices = [int(start_frame + i * step) for i in range(num_frames)]
+
+    frames = []
+    for target_idx in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+        success, frame = cap.read()
+        if not success:
+            continue
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(frame_rgb).convert("RGB")
+        frames.append(pil_img)
+
+    cap.release()
+    return frames
+
+
+def parse_options(options):
+    option_letters = [chr(ord("A") + i) for i in range(len(options))]
+
+    if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)):
+        return "\n".join(options)
+
+    choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)])
+    return choices_str
+
+
+def lemonade_doc_to_visual(doc):
+    videos_dir = os.path.join(data_dir, "videos")
+
+    if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0:
+        print("Videos directory is empty — downloading and extracting...\n")
+        download_and_extract_lemonade_videos(data_dir)
+
+    video_filename = doc["Clip"] + "_hololens.mp4"
+
+    video_path = os.path.join(
+        videos_dir,
+        video_filename
+    )
+
+    if os.path.exists(video_path):
+        start = int(doc["Start"])
+        end = int(doc["End"])
+        frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES)
+    else:
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    return frames
+    
+
+def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+        
+    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+    
+    question = "Question: " + doc["Question"]
+    parsed_options = parse_options(eval(doc["Answers"]))
+    choices = "Choices:\n" + parsed_options
+
+    return f"{pre_prompt}{question}\n{choices}{post_prompt}"
+
+
+def get_multi_choice_info(options):
+    """
+    Given the list of options for multiple choice question
+    Return the index2ans and all_choices
+    """
+    assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}"
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    if response == "API Error":
+        return "API Error"
+
+    if response == "":
+        return "Empty Response"
+
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " " 
+
+    index_ans = True
+    ans_with_brack = False
+    ans_with_period = False
+    ans_with_colon = False
+    candidates = []
+
+    for choice in all_choices:
+        if f"{choice}." in response:
+            candidates.append(choice)
+            ans_with_period = True
+    for choice in all_choices: 
+        if f"{choice}:" in response:
+            candidates.append(choice)
+            ans_with_colon = True
+    if len(candidates) == 0:
+        for choice in all_choices:
+            if f"({choice})" in response:
+                candidates.append(choice)
+                ans_with_brack = True
+    if len(candidates) == 0:
+        for choice in all_choices: 
+            if f"{choice} " in response:
+                candidates.append(choice)
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False 
+    if len(candidates) == 0:
+        pred_index = "A"
+
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_period:
+                for can in candidates:
+                    index = response.rfind(f"{can}.")
+                    start_indexes.append(index)
+            elif ans_with_colon:
+                for can in candidates:
+                    index = response.rfind(f"{can}:")
+                    start_indexes.append(index)
+            elif ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+def lemonade_process_results(doc, results):
+    pred = results[0]
+
+    index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"]))
+    parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
+
+    acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred}
+    return {"acc": acc}
+
+
+def lemonade_aggregate_results(results):
+    def compute_accuracy(grouped_results):
+        acc_dict = {}
+        for key, samples in grouped_results.items():
+            correct = sum([r["parsed_pred"] == r["answer"] for r in samples])
+            total = len(samples)
+            acc = round(correct / total, 5) if total > 0 else 0.0
+            stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0
+            acc_dict[key] = {
+                "num": total,
+                "acc": acc,
+                "acc_stderr": stderr,
+            }
+        return acc_dict
+
+    qid_results = defaultdict(list)
+    category_results = defaultdict(list)
+    subcategory_results = defaultdict(list)
+    difficulty_results = defaultdict(list)
+
+    valid_results = [r for r in results if r["parsed_pred"] != "API Error"]
+
+    for r in valid_results:
+        qid_results[r["QID"]].append(r)
+        category_results[r["category"]].append(r)
+        subcategory_results[r["subcategory"]].append(r)
+        difficulty_results[r["difficulty"]].append(r)
+
+    qid_acc = compute_accuracy(qid_results)
+    category_acc = compute_accuracy(category_results)
+    subcategory_acc = compute_accuracy(subcategory_results)
+    difficulty_acc = compute_accuracy(difficulty_results)
+
+    total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results])
+    total = len(valid_results)
+    overall_acc = round(total_correct / total, 5) if total > 0 else 0.0
+    overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0
+
+    print("\nResults:")
+
+    print("\nAccuracy per QID:")
+    for k, v in qid_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Category:")
+    for k, v in category_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Subcategory:")
+    for k, v in subcategory_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Difficulty:")
+    for k, v in difficulty_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)")
+        
+    return overall_acc
\ No newline at end of file

From 87cf67a1dff36312d9b3e880b52772a31b07fb63 Mon Sep 17 00:00:00 2001
From: andy <andy.bonnetto@epfl.ch>
Date: Mon, 18 Aug 2025 16:46:50 +0200
Subject: [PATCH 06/12] black and isort formatting

---
 lmms_eval/tasks/lemonade/utils.py | 53 ++++++++++++++-----------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index 7466c1a1f..c3b2f2c83 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -1,16 +1,17 @@
 import os
-from datasets import load_dataset
+import zipfile
+from collections import defaultdict
 from datetime import datetime
+
 import cv2
-from collections import defaultdict
-from PIL import Image
+import lmms_eval.tasks._task_utils.file_utils as file_utils
 import numpy as np
 import requests
-import zipfile
-from tqdm import tqdm
-import lmms_eval.tasks._task_utils.file_utils as file_utils
-from lmms_eval.utils import load_yaml_config
+from datasets import load_dataset
 from huggingface_hub import hf_hub_download
+from lmms_eval.utils import load_yaml_config
+from PIL import Image
+from tqdm import tqdm
 
 MAX_NUM_FRAMES = 8
 
@@ -24,6 +25,7 @@
 
 data_dir = "./data/lemonade"
 
+
 def download_and_extract_lemonade_videos(data_dir):
     os.makedirs(data_dir, exist_ok=True)
     videos_dir = os.path.join(data_dir, "videos")
@@ -31,18 +33,14 @@ def download_and_extract_lemonade_videos(data_dir):
 
     for zip_name in LEMONADE_ZIP_NAMES:
         print(f"Downloading {zip_name} from Hugging Face...")
-        zip_path = hf_hub_download(
-            repo_id="amathislab/LEMONADE",
-            filename=zip_name,
-            repo_type="dataset",
-            cache_dir=os.path.join(data_dir, "cache") 
-        )
-
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_path = hf_hub_download(repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", cache_dir=os.path.join(data_dir, "cache"))
+
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
             zip_ref.extractall(videos_dir)
-    
+
     print("All videos downloaded and extracted successfully.\n")
 
+
 def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES):
 
     cap = cv2.VideoCapture(video_file)
@@ -89,10 +87,7 @@ def lemonade_doc_to_visual(doc):
 
     video_filename = doc["Clip"] + "_hololens.mp4"
 
-    video_path = os.path.join(
-        videos_dir,
-        video_filename
-    )
+    video_path = os.path.join(videos_dir, video_filename)
 
     if os.path.exists(video_path):
         start = int(doc["Start"])
@@ -102,15 +97,15 @@ def lemonade_doc_to_visual(doc):
         raise FileNotFoundError(f"Video file not found: {video_path}")
 
     return frames
-    
+
 
 def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
-        
+
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     question = "Question: " + doc["Question"]
     parsed_options = parse_options(eval(doc["Answers"]))
     choices = "Choices:\n" + parsed_options
@@ -148,7 +143,7 @@ def parse_multi_choice_response(response, all_choices, index2ans):
 
     for char in [",", ".", "!", "?", ";", ":", "'"]:
         response = response.strip(char)
-    response = " " + response + " " 
+    response = " " + response + " "
 
     index_ans = True
     ans_with_brack = False
@@ -160,7 +155,7 @@ def parse_multi_choice_response(response, all_choices, index2ans):
         if f"{choice}." in response:
             candidates.append(choice)
             ans_with_period = True
-    for choice in all_choices: 
+    for choice in all_choices:
         if f"{choice}:" in response:
             candidates.append(choice)
             ans_with_colon = True
@@ -170,14 +165,14 @@ def parse_multi_choice_response(response, all_choices, index2ans):
                 candidates.append(choice)
                 ans_with_brack = True
     if len(candidates) == 0:
-        for choice in all_choices: 
+        for choice in all_choices:
             if f"{choice} " in response:
                 candidates.append(choice)
     if len(candidates) == 0 and len(response.split()) > 5:
         for index, ans in index2ans.items():
             if ans.lower() in response.lower():
                 candidates.append(index)
-                index_ans = False 
+                index_ans = False
     if len(candidates) == 0:
         pred_index = "A"
 
@@ -278,5 +273,5 @@ def compute_accuracy(grouped_results):
         print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
 
     print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)")
-        
-    return overall_acc
\ No newline at end of file
+
+    return overall_acc

From 0391cd4e24a183008c9678dbdb074793eec1a798 Mon Sep 17 00:00:00 2001
From: andy <andy.bonnetto@epfl.ch>
Date: Mon, 18 Aug 2025 17:01:32 +0200
Subject: [PATCH 07/12] clean imports

---
 lmms_eval/tasks/lemonade/utils.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index c3b2f2c83..bb51677b2 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -1,17 +1,11 @@
 import os
 import zipfile
 from collections import defaultdict
-from datetime import datetime
 
 import cv2
-import lmms_eval.tasks._task_utils.file_utils as file_utils
 import numpy as np
-import requests
-from datasets import load_dataset
 from huggingface_hub import hf_hub_download
-from lmms_eval.utils import load_yaml_config
 from PIL import Image
-from tqdm import tqdm
 
 MAX_NUM_FRAMES = 8
 

From 060935d4b388087019bfc1be33c7b85e26aaf05b Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Mon, 8 Sep 2025 08:55:34 +0000
Subject: [PATCH 08/12] implement coderabbitai comments

---
 lmms_eval/tasks/lemonade/utils.py | 217 +++++++++++++++++++++---------
 1 file changed, 153 insertions(+), 64 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index bb51677b2..30ba75241 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -1,14 +1,15 @@
+import ast
 import os
-import zipfile
-from collections import defaultdict
-
 import cv2
 import numpy as np
-from huggingface_hub import hf_hub_download
+import yaml
+import zipfile
+from collections import defaultdict
 from PIL import Image
+from typing import Any, Optional
+from huggingface_hub import hf_hub_download
 
 MAX_NUM_FRAMES = 8
-
 LEMONADE_ZIP_NAMES = [
     "videos_batch_0.zip",
     "videos_batch_1.zip",
@@ -16,53 +17,82 @@
     "videos_batch_3.zip",
     "videos_batch_4.zip",
 ]
+DEFAULT_DATA_DIR = "./data/lemonade"
 
-data_dir = "./data/lemonade"
+def download_and_extract_lemonade_videos(data_dir: str) -> None:
+    """
+    Download and extract LEMONADE files from Hugging Face into a local data directory.
 
+    Args:
+        data_dir: Directory that stores the files.
+    Returns:
+        None
+    """
 
-def download_and_extract_lemonade_videos(data_dir):
     os.makedirs(data_dir, exist_ok=True)
     videos_dir = os.path.join(data_dir, "videos")
     os.makedirs(videos_dir, exist_ok=True)
 
     for zip_name in LEMONADE_ZIP_NAMES:
         print(f"Downloading {zip_name} from Hugging Face...")
-        zip_path = hf_hub_download(repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", cache_dir=os.path.join(data_dir, "cache"))
-
-        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_path = hf_hub_download(
+            repo_id="amathislab/LEMONADE",
+            filename=zip_name,
+            repo_type="dataset",
+            cache_dir=os.path.join(data_dir, "cache") 
+        )
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(videos_dir)
-
+    
     print("All videos downloaded and extracted successfully.\n")
 
-
-def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES):
+def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = MAX_NUM_FRAMES) -> list[Image.Image]:
+    """
+    Args:
+        video_file: Path to the video file.
+        start_frame: Starting frame index.
+        end_frame: Ending frame index.
+        max_num_frames: Number of frames to sample from the video segment.
+    Returns:
+        List of PIL Image objects representing sampled frames
+    """
 
     cap = cv2.VideoCapture(video_file)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    start_frame = max(0, start_frame)
-    end_frame = min(end_frame, total_frames - 1)
-    total_valid_frames = end_frame - start_frame + 1
-    num_frames = min(max_num_frames, total_valid_frames)
-
-    step = total_valid_frames / num_frames
-    frame_indices = [int(start_frame + i * step) for i in range(num_frames)]
-
-    frames = []
-    for target_idx in frame_indices:
-        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
-        success, frame = cap.read()
-        if not success:
-            continue
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        pil_img = Image.fromarray(frame_rgb).convert("RGB")
-        frames.append(pil_img)
-
-    cap.release()
-    return frames
-
+    try: 
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        start_frame = max(0, start_frame)
+        end_frame = min(end_frame, total_frames - 1)
+        total_valid_frames = end_frame - start_frame + 1
+        num_frames = min(max_num_frames, total_valid_frames)
+        step = total_valid_frames / num_frames
+        frame_indices = [int(start_frame + i * step) for i in range(num_frames)]
+        frames = []
+        for target_idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+            success, frame = cap.read()
+            if not success:
+                continue
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            pil_img = Image.fromarray(frame_rgb).convert("RGB")
+            frames.append(pil_img)
+
+        return frames
+    finally:
+        cap.release()
+
+def parse_options(options: list[str]) -> str:
+    """
+    Format a list of multiple-choice options into a string.
+    The function assigns letters to each option and returns them in a newline-separated string.    
+    
+    Args:
+        options (list[str]): A list of option strings.
+
+    Returns:
+        str: A formatted string with each option on a new line, prefixed by its corresponding letter.
+    """
 
-def parse_options(options):
     option_letters = [chr(ord("A") + i) for i in range(len(options))]
 
     if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)):
@@ -72,15 +102,22 @@ def parse_options(options):
     return choices_str
 
 
-def lemonade_doc_to_visual(doc):
-    videos_dir = os.path.join(data_dir, "videos")
+def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]:
+    """
+    Load video frames for a given entry in the LEMONADE dataset.
+
+    Args:
+        doc: A dictionary representing an entry in the dataset.
+    Returns:
+        frames: List of PIL Image objects representing sampled frames
+    """
 
+    videos_dir = os.path.join(DEFAULT_DATA_DIR, "videos")
     if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0:
         print("Videos directory is empty — downloading and extracting...\n")
-        download_and_extract_lemonade_videos(data_dir)
+        download_and_extract_lemonade_videos(DEFAULT_DATA_DIR)
 
     video_filename = doc["Clip"] + "_hololens.mp4"
-
     video_path = os.path.join(videos_dir, video_filename)
 
     if os.path.exists(video_path):
@@ -88,32 +125,51 @@ def lemonade_doc_to_visual(doc):
         end = int(doc["End"])
         frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES)
     else:
-        raise FileNotFoundError(f"Video file not found: {video_path}")
-
+        raise FileNotFoundError(
+            f"Video file not found: {video_path}. "
+            f"Expected video for clip '{doc['Clip']}' at {video_path}"
+        )
     return frames
+    
 
+def lemonade_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: Optional[dict[str, Any]] = None) -> str:
+    """
+    Convert a LEMONADE dataset entry into a formatted text prompt.
+    Args:
+        doc: A dictionary representing an entry in the dataset.
+        lmms_eval_specific_kwargs: Optional dictionary for additional prompt formatting.
+    Returns:
+        str: A formatted prompt string ready for model input
+    """
 
-def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
-
+        
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-
+    
     question = "Question: " + doc["Question"]
-    parsed_options = parse_options(eval(doc["Answers"]))
+    parsed_options = parse_options(ast.literal_eval(doc["Answers"]))
     choices = "Choices:\n" + parsed_options
 
     return f"{pre_prompt}{question}\n{choices}{post_prompt}"
 
 
-def get_multi_choice_info(options):
+def get_multi_choice_info(options: list[str]) -> tuple[dict[str, str], list[str]]:
     """
-    Given the list of options for multiple choice question
-    Return the index2ans and all_choices
+    Map a list of options to letter labels (A, B, C, ...).
+    
+    Args:
+        options: The set of answer options
+    Returns:
+        tuple[dict[str, str], list[str]]: 
+            - index2ans: Mapping from letters to option text.
+            - all_choices: List of the assigned letters.
     """
-    assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}"
-
+    
+    if not isinstance(options, list):
+        raise TypeError(f"Expected list of options, got {type(options)}: {options}")
+   
     start_chr = "A"
     all_choices = []
     index2ans = {}
@@ -124,11 +180,18 @@ def get_multi_choice_info(options):
     return index2ans, all_choices
 
 
-def parse_multi_choice_response(response, all_choices, index2ans):
+def parse_multi_choice_response(response: str, all_choices: list[str], index2ans: dict[str, str]) -> str:
     """
-    Parse the prediction from the generated response.
-    Return the predicted index e.g., A, B, C, D.
+    Parse a model response and return the predicted choice label (e.g., "A", "B", "C", "D"). 
+
+    Args:
+        response (str): The generated response to parse.
+        all_choices (list[str]): The set of valid choice labels.
+        index2ans (dict[str, str]): Mapping from choice labels to their full answer text.
+    Returns:
+        str: The predicted choice label.
     """
+
     if response == "API Error":
         return "API Error"
 
@@ -137,7 +200,7 @@ def parse_multi_choice_response(response, all_choices, index2ans):
 
     for char in [",", ".", "!", "?", ";", ":", "'"]:
         response = response.strip(char)
-    response = " " + response + " "
+    response = " " + response + " " 
 
     index_ans = True
     ans_with_brack = False
@@ -149,7 +212,7 @@ def parse_multi_choice_response(response, all_choices, index2ans):
         if f"{choice}." in response:
             candidates.append(choice)
             ans_with_period = True
-    for choice in all_choices:
+    for choice in all_choices: 
         if f"{choice}:" in response:
             candidates.append(choice)
             ans_with_colon = True
@@ -159,14 +222,14 @@ def parse_multi_choice_response(response, all_choices, index2ans):
                 candidates.append(choice)
                 ans_with_brack = True
     if len(candidates) == 0:
-        for choice in all_choices:
+        for choice in all_choices: 
             if f"{choice} " in response:
                 candidates.append(choice)
     if len(candidates) == 0 and len(response.split()) > 5:
         for index, ans in index2ans.items():
             if ans.lower() in response.lower():
                 candidates.append(index)
-                index_ans = False
+                index_ans = False 
     if len(candidates) == 0:
         pred_index = "A"
 
@@ -200,17 +263,43 @@ def parse_multi_choice_response(response, all_choices, index2ans):
     return pred_index
 
 
-def lemonade_process_results(doc, results):
+def lemonade_process_results(doc: dict[str, Any], results: list[Any]) -> dict[str, dict]:
+    """
+    Process the results from the model and compute accuracy.
+    
+    Args:
+        doc: A dictionary representing an entry in the dataset.
+        results: List of model outputs.
+    Returns:
+        A dictionary containing accuracy information. 
+    """
+    
     pred = results[0]
-
-    index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"]))
+    index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["Answers"]))
     parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
 
-    acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred}
+    acc = {
+        "QID": doc["QID"],
+        "category": doc["Category"],
+        "subcategory": doc["Subcategory"],
+        "difficulty": doc["Difficulty"],
+        "answer": doc["Correct Answer"],
+        "parsed_pred": parsed_pred,
+        "original_pred": pred
+    }    
     return {"acc": acc}
 
 
-def lemonade_aggregate_results(results):
+def lemonade_aggregate_results(results: list[dict[str, Any]]) -> float:
+    """
+    Aggregate the results from the evaluation.
+    
+    Args:
+        results: List of dicts containing individual evaluation results.
+    Returns:
+        overall_acc: Overall accuracy.
+
+    """
     def compute_accuracy(grouped_results):
         acc_dict = {}
         for key, samples in grouped_results.items():

From 50d70dae880005507e4d13d9f77446fb276b6cb0 Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Wed, 10 Sep 2025 08:30:31 +0000
Subject: [PATCH 09/12] download data in cache

---
 lmms_eval/tasks/lemonade/utils.py | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index 30ba75241..d4ad7a243 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -2,7 +2,6 @@
 import os
 import cv2
 import numpy as np
-import yaml
 import zipfile
 from collections import defaultdict
 from PIL import Image
@@ -10,14 +9,11 @@
 from huggingface_hub import hf_hub_download
 
 MAX_NUM_FRAMES = 8
-LEMONADE_ZIP_NAMES = [
-    "videos_batch_0.zip",
-    "videos_batch_1.zip",
-    "videos_batch_2.zip",
-    "videos_batch_3.zip",
-    "videos_batch_4.zip",
-]
-DEFAULT_DATA_DIR = "./data/lemonade"
+LEMONADE_ZIP_NAMES = [f"videos_batch_{i}.zip" for i in range(5)]
+
+HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(HF_HOME)
+videos_dir = os.path.join(base_cache_dir, "videos")
 
 def download_and_extract_lemonade_videos(data_dir: str) -> None:
     """
@@ -30,8 +26,8 @@ def download_and_extract_lemonade_videos(data_dir: str) -> None:
     """
 
     os.makedirs(data_dir, exist_ok=True)
-    videos_dir = os.path.join(data_dir, "videos")
     os.makedirs(videos_dir, exist_ok=True)
+    print(f"Creating videos directory at {videos_dir}...")
 
     for zip_name in LEMONADE_ZIP_NAMES:
         print(f"Downloading {zip_name} from Hugging Face...")
@@ -39,12 +35,13 @@ def download_and_extract_lemonade_videos(data_dir: str) -> None:
             repo_id="amathislab/LEMONADE",
             filename=zip_name,
             repo_type="dataset",
-            cache_dir=os.path.join(data_dir, "cache") 
+            local_dir=os.path.join(base_cache_dir, "lemonade_zips"),
+            local_dir_use_symlinks=False,
+            resume_download=True,
         )
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            zf.extractall(videos_dir)
 
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(videos_dir)
-    
     print("All videos downloaded and extracted successfully.\n")
 
 def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = MAX_NUM_FRAMES) -> list[Image.Image]:
@@ -112,10 +109,9 @@ def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]:
         frames: List of PIL Image objects representing sampled frames
     """
 
-    videos_dir = os.path.join(DEFAULT_DATA_DIR, "videos")
     if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0:
         print("Videos directory is empty — downloading and extracting...\n")
-        download_and_extract_lemonade_videos(DEFAULT_DATA_DIR)
+        download_and_extract_lemonade_videos(base_cache_dir)
 
     video_filename = doc["Clip"] + "_hololens.mp4"
     video_path = os.path.join(videos_dir, video_filename)

From 615004126e92239f0921544258b84e67d7fc9385 Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Sat, 27 Sep 2025 11:25:50 +0000
Subject: [PATCH 10/12] add README for lemonade

---
 lmms_eval/tasks/lemonade/README.md | 45 ++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 lmms_eval/tasks/lemonade/README.md

diff --git a/lmms_eval/tasks/lemonade/README.md b/lmms_eval/tasks/lemonade/README.md
new file mode 100644
index 000000000..518379188
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/README.md
@@ -0,0 +1,45 @@
+# LEMONADE
+
+## Task Description  
+
+**LEMONADE** (Language models Evaluation of MOtion aNd Action-Driven Enquiries) is a QA benchmark extracted from the **EPFL-Smart-Kitchen-30** dataset (see [arXiv](https://arxiv.org/abs/2506.01608)). It consists of **36,521 closed-ended QA pairs** linked to egocentric video clips.  
+
+Questions are organized into three groups and six subcategories:  
+
+- **Behavior Understanding**  
+  - *Perception*: recognizing perceived actions  
+  - *Reasoning*: reasoning over unseen behaviors  
+- **Long-term Understanding**  
+  - *Summarization*: summarizing over longer clips  
+  - *Session Properties*: inferring session-level information  
+- **Motion & Biomechanics**  
+  - *Physical Attributes*: inferring hand shapes, joint angles, etc.  
+  - *Kinematics*: inferring trajectory velocities  
+
+In the associated publication, models were evaluated on this benchmark using **`lmms-eval`**.  
+
+
+## Implementation  
+
+- **utils.py**: Handles data loading from Hugging Face, video loading, answer parsing, and metric evaluation.  
+- **lemonade.yaml**: Contains the default prompts and evaluation settings.
+
+When running LEMONADE through `lmms-eval`, the data is automatically downloaded. For direct dataset access, please refer to [Hugging Face](https://huggingface.co/datasets/amathislab/LEMONADE) or [Zenodo](https://zenodo.org/records/15535461).  
+
+Performance is evaluated in terms of accuracy against the ground truth, with results reported overall as well as per category and subcategory.
+
+## Citation  
+
+If you use **LEMONADE**, please cite:  
+
+```bibtex
+@misc{bonnetto2025epflsmartkitchen,
+      title={EPFL-Smart-Kitchen-30: Densely annotated cooking dataset with 3D kinematics to challenge video and language models}, 
+      author={Andy Bonnetto and Haozhe Qi and Franklin Leong and Matea Tashkovska and Mahdi Rad and Solaiman Shokur and Friedhelm Hummel and Silvestro Micera and Marc Pollefeys and Alexander Mathis},
+      year={2025},
+      eprint={2506.01608},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2506.01608}, 
+}
+```
\ No newline at end of file

From e31fc8ee7c80d7c8c21c4e3a8ee1808d7115c1b4 Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Sat, 27 Sep 2025 13:49:24 +0000
Subject: [PATCH 11/12] remove custom download def, move max_num_frames to
 config

---
 lmms_eval/tasks/lemonade/lemonade.yaml |  7 +++-
 lmms_eval/tasks/lemonade/utils.py      | 51 ++++++++------------------
 2 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml
index 22b81d02d..e4263faf5 100644
--- a/lmms_eval/tasks/lemonade/lemonade.yaml
+++ b/lmms_eval/tasks/lemonade/lemonade.yaml
@@ -1,4 +1,8 @@
 dataset_path: amathislab/LEMONADE
+dataset_kwargs:
+  video: true
+  cache_dir: lemonade_data
+  force_unzip: true
 task: "lemonade"
 test_split: test
 output_type: generate_until
@@ -20,4 +24,5 @@ metric_list:
 lmms_eval_specific_kwargs:
   default:
     pre_prompt: "Answer the following multiple-choice question using the given images.\n"
-    post_prompt: "\nRespond only with the letter of the correct answer."
\ No newline at end of file
+    post_prompt: "\nRespond only with the letter of the correct answer."
+  max_num_frames: 8
\ No newline at end of file
diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
index d4ad7a243..ad43fc05d 100644
--- a/lmms_eval/tasks/lemonade/utils.py
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -3,48 +3,31 @@
 import cv2
 import numpy as np
 import zipfile
+import yaml
 from collections import defaultdict
+from pathlib import Path
 from PIL import Image
 from typing import Any, Optional
 from huggingface_hub import hf_hub_download
 
-MAX_NUM_FRAMES = 8
-LEMONADE_ZIP_NAMES = [f"videos_batch_{i}.zip" for i in range(5)]
+with open(Path(__file__).parent / "lemonade.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for line in raw_data:
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
 
 HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface/")
 base_cache_dir = os.path.expanduser(HF_HOME)
-videos_dir = os.path.join(base_cache_dir, "videos")
+cache_dir = config["dataset_kwargs"]["cache_dir"]
+videos_dir = os.path.join(base_cache_dir, cache_dir)
 
-def download_and_extract_lemonade_videos(data_dir: str) -> None:
-    """
-    Download and extract LEMONADE files from Hugging Face into a local data directory.
+max_num_frames = config.get("lmms_eval_specific_kwargs", {}).get("max_num_frames", 8)
 
-    Args:
-        data_dir: Directory that stores the files.
-    Returns:
-        None
-    """
 
-    os.makedirs(data_dir, exist_ok=True)
-    os.makedirs(videos_dir, exist_ok=True)
-    print(f"Creating videos directory at {videos_dir}...")
-
-    for zip_name in LEMONADE_ZIP_NAMES:
-        print(f"Downloading {zip_name} from Hugging Face...")
-        zip_path = hf_hub_download(
-            repo_id="amathislab/LEMONADE",
-            filename=zip_name,
-            repo_type="dataset",
-            local_dir=os.path.join(base_cache_dir, "lemonade_zips"),
-            local_dir_use_symlinks=False,
-            resume_download=True,
-        )
-        with zipfile.ZipFile(zip_path, "r") as zf:
-            zf.extractall(videos_dir)
-
-    print("All videos downloaded and extracted successfully.\n")
-
-def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = MAX_NUM_FRAMES) -> list[Image.Image]:
+def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = max_num_frames) -> list[Image.Image]:
     """
     Args:
         video_file: Path to the video file.
@@ -109,17 +92,13 @@ def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]:
         frames: List of PIL Image objects representing sampled frames
     """
 
-    if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0:
-        print("Videos directory is empty — downloading and extracting...\n")
-        download_and_extract_lemonade_videos(base_cache_dir)
-
     video_filename = doc["Clip"] + "_hololens.mp4"
     video_path = os.path.join(videos_dir, video_filename)
 
     if os.path.exists(video_path):
         start = int(doc["Start"])
         end = int(doc["End"])
-        frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES)
+        frames = load_video(video_path, start, end, max_num_frames=max_num_frames)
     else:
         raise FileNotFoundError(
             f"Video file not found: {video_path}. "

From e891bad9c6b5da481b5aed8e0b4315f0e069e869 Mon Sep 17 00:00:00 2001
From: Matea Tashkovska <matea_tas@yahoo.com>
Date: Sat, 27 Sep 2025 13:52:42 +0000
Subject: [PATCH 12/12] add lemonade to current_tasks

---
 docs/current_tasks.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/current_tasks.md b/docs/current_tasks.md
index 98fe0e319..5d48c4302 100644
--- a/docs/current_tasks.md
+++ b/docs/current_tasks.md
@@ -245,6 +245,7 @@ python -m lmms_eval --tasks list_with_num
   - egoschema_mcppl
   - egoschema_subset_mcppl
   - egoschema_subset
+- [LEMONADE](https://huggingface.co/datasets/amathislab/LEMONADE) (lemonade)
 - [LongVideoBench](https://github.com/longvideobench/LongVideoBench)
 - [MovieChat](https://github.com/rese1f/MovieChat) (moviechat)
   - Global Mode for entire video (moviechat_global)