From 139a2b4c17f051226a856ab2001957dbc5d8849f Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Sat, 5 Jul 2025 20:59:31 +0000 Subject: [PATCH 01/12] Video loader with caching and download --- lmms_eval/tasks/lemonade/lemonade.yaml | 23 ++ lmms_eval/tasks/lemonade/utils.py | 282 +++++++++++++++++++++++++ 2 files changed, 305 insertions(+) create mode 100644 lmms_eval/tasks/lemonade/lemonade.yaml create mode 100644 lmms_eval/tasks/lemonade/utils.py diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml new file mode 100644 index 000000000..22b81d02d --- /dev/null +++ b/lmms_eval/tasks/lemonade/lemonade.yaml @@ -0,0 +1,23 @@ +dataset_path: amathislab/LEMONADE +task: "lemonade" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.lemonade_doc_to_visual +doc_to_text: !function utils.lemonade_doc_to_text +doc_to_target: "Correct Answer" + +generation_kwargs: + max_new_tokens: 128 + temperature: 0 + do_sample: false + +process_results: !function utils.lemonade_process_results +metric_list: + - metric: acc + aggregation: !function utils.lemonade_aggregate_results + higher_is_better: true + +lmms_eval_specific_kwargs: + default: + pre_prompt: "Answer the following multiple-choice question using the given images.\n" + post_prompt: "\nRespond only with the letter of the correct answer." \ No newline at end of file diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py new file mode 100644 index 000000000..7466c1a1f --- /dev/null +++ b/lmms_eval/tasks/lemonade/utils.py @@ -0,0 +1,282 @@ +import os +from datasets import load_dataset +from datetime import datetime +import cv2 +from collections import defaultdict +from PIL import Image +import numpy as np +import requests +import zipfile +from tqdm import tqdm +import lmms_eval.tasks._task_utils.file_utils as file_utils +from lmms_eval.utils import load_yaml_config +from huggingface_hub import hf_hub_download + +MAX_NUM_FRAMES = 8 + +LEMONADE_ZIP_NAMES = [ + "videos_batch_0.zip", + "videos_batch_1.zip", + "videos_batch_2.zip", + "videos_batch_3.zip", + "videos_batch_4.zip", +] + +data_dir = "./data/lemonade" + +def download_and_extract_lemonade_videos(data_dir): + os.makedirs(data_dir, exist_ok=True) + videos_dir = os.path.join(data_dir, "videos") + os.makedirs(videos_dir, exist_ok=True) + + for zip_name in LEMONADE_ZIP_NAMES: + print(f"Downloading {zip_name} from Hugging Face...") + zip_path = hf_hub_download( + repo_id="amathislab/LEMONADE", + filename=zip_name, + repo_type="dataset", + cache_dir=os.path.join(data_dir, "cache") + ) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(videos_dir) + + print("All videos downloaded and extracted successfully.\n") + +def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES): + + cap = cv2.VideoCapture(video_file) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + start_frame = max(0, start_frame) + end_frame = min(end_frame, total_frames - 1) + total_valid_frames = end_frame - start_frame + 1 + num_frames = min(max_num_frames, total_valid_frames) + + step = total_valid_frames / num_frames + frame_indices = [int(start_frame + i * step) for i in range(num_frames)] + + frames = [] + for target_idx in frame_indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx) + success, frame = cap.read() + if not success: + continue + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_img = Image.fromarray(frame_rgb).convert("RGB") + frames.append(pil_img) + + cap.release() + return frames + + +def parse_options(options): + option_letters = [chr(ord("A") + i) for i in range(len(options))] + + if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)): + return "\n".join(options) + + choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]) + return choices_str + + +def lemonade_doc_to_visual(doc): + videos_dir = os.path.join(data_dir, "videos") + + if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0: + print("Videos directory is empty — downloading and extracting...\n") + download_and_extract_lemonade_videos(data_dir) + + video_filename = doc["Clip"] + "_hololens.mp4" + + video_path = os.path.join( + videos_dir, + video_filename + ) + + if os.path.exists(video_path): + start = int(doc["Start"]) + end = int(doc["End"]) + frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES) + else: + raise FileNotFoundError(f"Video file not found: {video_path}") + + return frames + + +def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") + post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") + + question = "Question: " + doc["Question"] + parsed_options = parse_options(eval(doc["Answers"])) + choices = "Choices:\n" + parsed_options + + return f"{pre_prompt}{question}\n{choices}{post_prompt}" + + +def get_multi_choice_info(options): + """ + Given the list of options for multiple choice question + Return the index2ans and all_choices + """ + assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}" + + start_chr = "A" + all_choices = [] + index2ans = {} + for i, option in enumerate(options): + index2ans[chr(ord(start_chr) + i)] = option + all_choices.append(chr(ord(start_chr) + i)) + + return index2ans, all_choices + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. + """ + if response == "API Error": + return "API Error" + + if response == "": + return "Empty Response" + + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " + + index_ans = True + ans_with_brack = False + ans_with_period = False + ans_with_colon = False + candidates = [] + + for choice in all_choices: + if f"{choice}." in response: + candidates.append(choice) + ans_with_period = True + for choice in all_choices: + if f"{choice}:" in response: + candidates.append(choice) + ans_with_colon = True + if len(candidates) == 0: + for choice in all_choices: + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + if len(candidates) == 0: + for choice in all_choices: + if f"{choice} " in response: + candidates.append(choice) + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False + if len(candidates) == 0: + pred_index = "A" + + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_period: + for can in candidates: + index = response.rfind(f"{can}.") + start_indexes.append(index) + elif ans_with_colon: + for can in candidates: + index = response.rfind(f"{can}:") + start_indexes.append(index) + elif ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + pred_index = candidates[np.argmax(start_indexes)] + else: + pred_index = candidates[0] + + return pred_index + + +def lemonade_process_results(doc, results): + pred = results[0] + + index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"])) + parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) + + acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred} + return {"acc": acc} + + +def lemonade_aggregate_results(results): + def compute_accuracy(grouped_results): + acc_dict = {} + for key, samples in grouped_results.items(): + correct = sum([r["parsed_pred"] == r["answer"] for r in samples]) + total = len(samples) + acc = round(correct / total, 5) if total > 0 else 0.0 + stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0 + acc_dict[key] = { + "num": total, + "acc": acc, + "acc_stderr": stderr, + } + return acc_dict + + qid_results = defaultdict(list) + category_results = defaultdict(list) + subcategory_results = defaultdict(list) + difficulty_results = defaultdict(list) + + valid_results = [r for r in results if r["parsed_pred"] != "API Error"] + + for r in valid_results: + qid_results[r["QID"]].append(r) + category_results[r["category"]].append(r) + subcategory_results[r["subcategory"]].append(r) + difficulty_results[r["difficulty"]].append(r) + + qid_acc = compute_accuracy(qid_results) + category_acc = compute_accuracy(category_results) + subcategory_acc = compute_accuracy(subcategory_results) + difficulty_acc = compute_accuracy(difficulty_results) + + total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results]) + total = len(valid_results) + overall_acc = round(total_correct / total, 5) if total > 0 else 0.0 + overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0 + + print("\nResults:") + + print("\nAccuracy per QID:") + for k, v in qid_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Category:") + for k, v in category_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Subcategory:") + for k, v in subcategory_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Difficulty:") + for k, v in difficulty_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)") + + return overall_acc \ No newline at end of file From fe24c04dd351139fbce20c5a3402d5c9da857717 Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Sat, 5 Jul 2025 20:59:31 +0000 Subject: [PATCH 02/12] Video loader with caching and download --- lmms_eval/tasks/lemonade/lemonade.yaml | 23 ++ lmms_eval/tasks/lemonade/utils.py | 282 +++++++++++++++++++++++++ 2 files changed, 305 insertions(+) create mode 100644 lmms_eval/tasks/lemonade/lemonade.yaml create mode 100644 lmms_eval/tasks/lemonade/utils.py diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml new file mode 100644 index 000000000..22b81d02d --- /dev/null +++ b/lmms_eval/tasks/lemonade/lemonade.yaml @@ -0,0 +1,23 @@ +dataset_path: amathislab/LEMONADE +task: "lemonade" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.lemonade_doc_to_visual +doc_to_text: !function utils.lemonade_doc_to_text +doc_to_target: "Correct Answer" + +generation_kwargs: + max_new_tokens: 128 + temperature: 0 + do_sample: false + +process_results: !function utils.lemonade_process_results +metric_list: + - metric: acc + aggregation: !function utils.lemonade_aggregate_results + higher_is_better: true + +lmms_eval_specific_kwargs: + default: + pre_prompt: "Answer the following multiple-choice question using the given images.\n" + post_prompt: "\nRespond only with the letter of the correct answer." \ No newline at end of file diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py new file mode 100644 index 000000000..7466c1a1f --- /dev/null +++ b/lmms_eval/tasks/lemonade/utils.py @@ -0,0 +1,282 @@ +import os +from datasets import load_dataset +from datetime import datetime +import cv2 +from collections import defaultdict +from PIL import Image +import numpy as np +import requests +import zipfile +from tqdm import tqdm +import lmms_eval.tasks._task_utils.file_utils as file_utils +from lmms_eval.utils import load_yaml_config +from huggingface_hub import hf_hub_download + +MAX_NUM_FRAMES = 8 + +LEMONADE_ZIP_NAMES = [ + "videos_batch_0.zip", + "videos_batch_1.zip", + "videos_batch_2.zip", + "videos_batch_3.zip", + "videos_batch_4.zip", +] + +data_dir = "./data/lemonade" + +def download_and_extract_lemonade_videos(data_dir): + os.makedirs(data_dir, exist_ok=True) + videos_dir = os.path.join(data_dir, "videos") + os.makedirs(videos_dir, exist_ok=True) + + for zip_name in LEMONADE_ZIP_NAMES: + print(f"Downloading {zip_name} from Hugging Face...") + zip_path = hf_hub_download( + repo_id="amathislab/LEMONADE", + filename=zip_name, + repo_type="dataset", + cache_dir=os.path.join(data_dir, "cache") + ) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(videos_dir) + + print("All videos downloaded and extracted successfully.\n") + +def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES): + + cap = cv2.VideoCapture(video_file) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + start_frame = max(0, start_frame) + end_frame = min(end_frame, total_frames - 1) + total_valid_frames = end_frame - start_frame + 1 + num_frames = min(max_num_frames, total_valid_frames) + + step = total_valid_frames / num_frames + frame_indices = [int(start_frame + i * step) for i in range(num_frames)] + + frames = [] + for target_idx in frame_indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx) + success, frame = cap.read() + if not success: + continue + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_img = Image.fromarray(frame_rgb).convert("RGB") + frames.append(pil_img) + + cap.release() + return frames + + +def parse_options(options): + option_letters = [chr(ord("A") + i) for i in range(len(options))] + + if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)): + return "\n".join(options) + + choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]) + return choices_str + + +def lemonade_doc_to_visual(doc): + videos_dir = os.path.join(data_dir, "videos") + + if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0: + print("Videos directory is empty — downloading and extracting...\n") + download_and_extract_lemonade_videos(data_dir) + + video_filename = doc["Clip"] + "_hololens.mp4" + + video_path = os.path.join( + videos_dir, + video_filename + ) + + if os.path.exists(video_path): + start = int(doc["Start"]) + end = int(doc["End"]) + frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES) + else: + raise FileNotFoundError(f"Video file not found: {video_path}") + + return frames + + +def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") + post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") + + question = "Question: " + doc["Question"] + parsed_options = parse_options(eval(doc["Answers"])) + choices = "Choices:\n" + parsed_options + + return f"{pre_prompt}{question}\n{choices}{post_prompt}" + + +def get_multi_choice_info(options): + """ + Given the list of options for multiple choice question + Return the index2ans and all_choices + """ + assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}" + + start_chr = "A" + all_choices = [] + index2ans = {} + for i, option in enumerate(options): + index2ans[chr(ord(start_chr) + i)] = option + all_choices.append(chr(ord(start_chr) + i)) + + return index2ans, all_choices + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. + """ + if response == "API Error": + return "API Error" + + if response == "": + return "Empty Response" + + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " + + index_ans = True + ans_with_brack = False + ans_with_period = False + ans_with_colon = False + candidates = [] + + for choice in all_choices: + if f"{choice}." in response: + candidates.append(choice) + ans_with_period = True + for choice in all_choices: + if f"{choice}:" in response: + candidates.append(choice) + ans_with_colon = True + if len(candidates) == 0: + for choice in all_choices: + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + if len(candidates) == 0: + for choice in all_choices: + if f"{choice} " in response: + candidates.append(choice) + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False + if len(candidates) == 0: + pred_index = "A" + + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_period: + for can in candidates: + index = response.rfind(f"{can}.") + start_indexes.append(index) + elif ans_with_colon: + for can in candidates: + index = response.rfind(f"{can}:") + start_indexes.append(index) + elif ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + pred_index = candidates[np.argmax(start_indexes)] + else: + pred_index = candidates[0] + + return pred_index + + +def lemonade_process_results(doc, results): + pred = results[0] + + index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"])) + parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) + + acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred} + return {"acc": acc} + + +def lemonade_aggregate_results(results): + def compute_accuracy(grouped_results): + acc_dict = {} + for key, samples in grouped_results.items(): + correct = sum([r["parsed_pred"] == r["answer"] for r in samples]) + total = len(samples) + acc = round(correct / total, 5) if total > 0 else 0.0 + stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0 + acc_dict[key] = { + "num": total, + "acc": acc, + "acc_stderr": stderr, + } + return acc_dict + + qid_results = defaultdict(list) + category_results = defaultdict(list) + subcategory_results = defaultdict(list) + difficulty_results = defaultdict(list) + + valid_results = [r for r in results if r["parsed_pred"] != "API Error"] + + for r in valid_results: + qid_results[r["QID"]].append(r) + category_results[r["category"]].append(r) + subcategory_results[r["subcategory"]].append(r) + difficulty_results[r["difficulty"]].append(r) + + qid_acc = compute_accuracy(qid_results) + category_acc = compute_accuracy(category_results) + subcategory_acc = compute_accuracy(subcategory_results) + difficulty_acc = compute_accuracy(difficulty_results) + + total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results]) + total = len(valid_results) + overall_acc = round(total_correct / total, 5) if total > 0 else 0.0 + overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0 + + print("\nResults:") + + print("\nAccuracy per QID:") + for k, v in qid_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Category:") + for k, v in category_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Subcategory:") + for k, v in subcategory_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Difficulty:") + for k, v in difficulty_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)") + + return overall_acc \ No newline at end of file From 99af1df198812e22721e5f434e66d80463dcc8a4 Mon Sep 17 00:00:00 2001 From: andy Date: Mon, 18 Aug 2025 16:46:50 +0200 Subject: [PATCH 03/12] black and isort formating --- lmms_eval/tasks/lemonade/utils.py | 53 ++++++++++++++----------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index 7466c1a1f..c3b2f2c83 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -1,16 +1,17 @@ import os -from datasets import load_dataset +import zipfile +from collections import defaultdict from datetime import datetime + import cv2 -from collections import defaultdict -from PIL import Image +import lmms_eval.tasks._task_utils.file_utils as file_utils import numpy as np import requests -import zipfile -from tqdm import tqdm -import lmms_eval.tasks._task_utils.file_utils as file_utils -from lmms_eval.utils import load_yaml_config +from datasets import load_dataset from huggingface_hub import hf_hub_download +from lmms_eval.utils import load_yaml_config +from PIL import Image +from tqdm import tqdm MAX_NUM_FRAMES = 8 @@ -24,6 +25,7 @@ data_dir = "./data/lemonade" + def download_and_extract_lemonade_videos(data_dir): os.makedirs(data_dir, exist_ok=True) videos_dir = os.path.join(data_dir, "videos") @@ -31,18 +33,14 @@ def download_and_extract_lemonade_videos(data_dir): for zip_name in LEMONADE_ZIP_NAMES: print(f"Downloading {zip_name} from Hugging Face...") - zip_path = hf_hub_download( - repo_id="amathislab/LEMONADE", - filename=zip_name, - repo_type="dataset", - cache_dir=os.path.join(data_dir, "cache") - ) - - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_path = hf_hub_download(repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", cache_dir=os.path.join(data_dir, "cache")) + + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(videos_dir) - + print("All videos downloaded and extracted successfully.\n") + def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES): cap = cv2.VideoCapture(video_file) @@ -89,10 +87,7 @@ def lemonade_doc_to_visual(doc): video_filename = doc["Clip"] + "_hololens.mp4" - video_path = os.path.join( - videos_dir, - video_filename - ) + video_path = os.path.join(videos_dir, video_filename) if os.path.exists(video_path): start = int(doc["Start"]) @@ -102,15 +97,15 @@ def lemonade_doc_to_visual(doc): raise FileNotFoundError(f"Video file not found: {video_path}") return frames - + def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None): if lmms_eval_specific_kwargs is None: lmms_eval_specific_kwargs = {} - + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + question = "Question: " + doc["Question"] parsed_options = parse_options(eval(doc["Answers"])) choices = "Choices:\n" + parsed_options @@ -148,7 +143,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): for char in [",", ".", "!", "?", ";", ":", "'"]: response = response.strip(char) - response = " " + response + " " + response = " " + response + " " index_ans = True ans_with_brack = False @@ -160,7 +155,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): if f"{choice}." in response: candidates.append(choice) ans_with_period = True - for choice in all_choices: + for choice in all_choices: if f"{choice}:" in response: candidates.append(choice) ans_with_colon = True @@ -170,14 +165,14 @@ def parse_multi_choice_response(response, all_choices, index2ans): candidates.append(choice) ans_with_brack = True if len(candidates) == 0: - for choice in all_choices: + for choice in all_choices: if f"{choice} " in response: candidates.append(choice) if len(candidates) == 0 and len(response.split()) > 5: for index, ans in index2ans.items(): if ans.lower() in response.lower(): candidates.append(index) - index_ans = False + index_ans = False if len(candidates) == 0: pred_index = "A" @@ -278,5 +273,5 @@ def compute_accuracy(grouped_results): print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)") - - return overall_acc \ No newline at end of file + + return overall_acc From 34d435fa0f7f7bcd1aae3b1e1db82bc75ec38595 Mon Sep 17 00:00:00 2001 From: andy Date: Mon, 18 Aug 2025 17:01:32 +0200 Subject: [PATCH 04/12] clean imports --- lmms_eval/tasks/lemonade/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index c3b2f2c83..bb51677b2 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -1,17 +1,11 @@ import os import zipfile from collections import defaultdict -from datetime import datetime import cv2 -import lmms_eval.tasks._task_utils.file_utils as file_utils import numpy as np -import requests -from datasets import load_dataset from huggingface_hub import hf_hub_download -from lmms_eval.utils import load_yaml_config from PIL import Image -from tqdm import tqdm MAX_NUM_FRAMES = 8 From a9caecd79fa6e353f4760d777c3cb6cbfdadd94d Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Sat, 5 Jul 2025 20:59:31 +0000 Subject: [PATCH 05/12] Video loader with caching and download --- lmms_eval/tasks/lemonade/lemonade.yaml | 23 ++ lmms_eval/tasks/lemonade/utils.py | 282 +++++++++++++++++++++++++ 2 files changed, 305 insertions(+) create mode 100644 lmms_eval/tasks/lemonade/lemonade.yaml create mode 100644 lmms_eval/tasks/lemonade/utils.py diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml new file mode 100644 index 000000000..22b81d02d --- /dev/null +++ b/lmms_eval/tasks/lemonade/lemonade.yaml @@ -0,0 +1,23 @@ +dataset_path: amathislab/LEMONADE +task: "lemonade" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.lemonade_doc_to_visual +doc_to_text: !function utils.lemonade_doc_to_text +doc_to_target: "Correct Answer" + +generation_kwargs: + max_new_tokens: 128 + temperature: 0 + do_sample: false + +process_results: !function utils.lemonade_process_results +metric_list: + - metric: acc + aggregation: !function utils.lemonade_aggregate_results + higher_is_better: true + +lmms_eval_specific_kwargs: + default: + pre_prompt: "Answer the following multiple-choice question using the given images.\n" + post_prompt: "\nRespond only with the letter of the correct answer." \ No newline at end of file diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py new file mode 100644 index 000000000..7466c1a1f --- /dev/null +++ b/lmms_eval/tasks/lemonade/utils.py @@ -0,0 +1,282 @@ +import os +from datasets import load_dataset +from datetime import datetime +import cv2 +from collections import defaultdict +from PIL import Image +import numpy as np +import requests +import zipfile +from tqdm import tqdm +import lmms_eval.tasks._task_utils.file_utils as file_utils +from lmms_eval.utils import load_yaml_config +from huggingface_hub import hf_hub_download + +MAX_NUM_FRAMES = 8 + +LEMONADE_ZIP_NAMES = [ + "videos_batch_0.zip", + "videos_batch_1.zip", + "videos_batch_2.zip", + "videos_batch_3.zip", + "videos_batch_4.zip", +] + +data_dir = "./data/lemonade" + +def download_and_extract_lemonade_videos(data_dir): + os.makedirs(data_dir, exist_ok=True) + videos_dir = os.path.join(data_dir, "videos") + os.makedirs(videos_dir, exist_ok=True) + + for zip_name in LEMONADE_ZIP_NAMES: + print(f"Downloading {zip_name} from Hugging Face...") + zip_path = hf_hub_download( + repo_id="amathislab/LEMONADE", + filename=zip_name, + repo_type="dataset", + cache_dir=os.path.join(data_dir, "cache") + ) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(videos_dir) + + print("All videos downloaded and extracted successfully.\n") + +def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES): + + cap = cv2.VideoCapture(video_file) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + start_frame = max(0, start_frame) + end_frame = min(end_frame, total_frames - 1) + total_valid_frames = end_frame - start_frame + 1 + num_frames = min(max_num_frames, total_valid_frames) + + step = total_valid_frames / num_frames + frame_indices = [int(start_frame + i * step) for i in range(num_frames)] + + frames = [] + for target_idx in frame_indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx) + success, frame = cap.read() + if not success: + continue + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_img = Image.fromarray(frame_rgb).convert("RGB") + frames.append(pil_img) + + cap.release() + return frames + + +def parse_options(options): + option_letters = [chr(ord("A") + i) for i in range(len(options))] + + if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)): + return "\n".join(options) + + choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]) + return choices_str + + +def lemonade_doc_to_visual(doc): + videos_dir = os.path.join(data_dir, "videos") + + if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0: + print("Videos directory is empty — downloading and extracting...\n") + download_and_extract_lemonade_videos(data_dir) + + video_filename = doc["Clip"] + "_hololens.mp4" + + video_path = os.path.join( + videos_dir, + video_filename + ) + + if os.path.exists(video_path): + start = int(doc["Start"]) + end = int(doc["End"]) + frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES) + else: + raise FileNotFoundError(f"Video file not found: {video_path}") + + return frames + + +def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") + post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") + + question = "Question: " + doc["Question"] + parsed_options = parse_options(eval(doc["Answers"])) + choices = "Choices:\n" + parsed_options + + return f"{pre_prompt}{question}\n{choices}{post_prompt}" + + +def get_multi_choice_info(options): + """ + Given the list of options for multiple choice question + Return the index2ans and all_choices + """ + assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}" + + start_chr = "A" + all_choices = [] + index2ans = {} + for i, option in enumerate(options): + index2ans[chr(ord(start_chr) + i)] = option + all_choices.append(chr(ord(start_chr) + i)) + + return index2ans, all_choices + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. + """ + if response == "API Error": + return "API Error" + + if response == "": + return "Empty Response" + + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " + + index_ans = True + ans_with_brack = False + ans_with_period = False + ans_with_colon = False + candidates = [] + + for choice in all_choices: + if f"{choice}." in response: + candidates.append(choice) + ans_with_period = True + for choice in all_choices: + if f"{choice}:" in response: + candidates.append(choice) + ans_with_colon = True + if len(candidates) == 0: + for choice in all_choices: + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + if len(candidates) == 0: + for choice in all_choices: + if f"{choice} " in response: + candidates.append(choice) + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False + if len(candidates) == 0: + pred_index = "A" + + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_period: + for can in candidates: + index = response.rfind(f"{can}.") + start_indexes.append(index) + elif ans_with_colon: + for can in candidates: + index = response.rfind(f"{can}:") + start_indexes.append(index) + elif ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + pred_index = candidates[np.argmax(start_indexes)] + else: + pred_index = candidates[0] + + return pred_index + + +def lemonade_process_results(doc, results): + pred = results[0] + + index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"])) + parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) + + acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred} + return {"acc": acc} + + +def lemonade_aggregate_results(results): + def compute_accuracy(grouped_results): + acc_dict = {} + for key, samples in grouped_results.items(): + correct = sum([r["parsed_pred"] == r["answer"] for r in samples]) + total = len(samples) + acc = round(correct / total, 5) if total > 0 else 0.0 + stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0 + acc_dict[key] = { + "num": total, + "acc": acc, + "acc_stderr": stderr, + } + return acc_dict + + qid_results = defaultdict(list) + category_results = defaultdict(list) + subcategory_results = defaultdict(list) + difficulty_results = defaultdict(list) + + valid_results = [r for r in results if r["parsed_pred"] != "API Error"] + + for r in valid_results: + qid_results[r["QID"]].append(r) + category_results[r["category"]].append(r) + subcategory_results[r["subcategory"]].append(r) + difficulty_results[r["difficulty"]].append(r) + + qid_acc = compute_accuracy(qid_results) + category_acc = compute_accuracy(category_results) + subcategory_acc = compute_accuracy(subcategory_results) + difficulty_acc = compute_accuracy(difficulty_results) + + total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results]) + total = len(valid_results) + overall_acc = round(total_correct / total, 5) if total > 0 else 0.0 + overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0 + + print("\nResults:") + + print("\nAccuracy per QID:") + for k, v in qid_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Category:") + for k, v in category_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Subcategory:") + for k, v in subcategory_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print("\nAccuracy per Difficulty:") + for k, v in difficulty_acc.items(): + print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") + + print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)") + + return overall_acc \ No newline at end of file From 87cf67a1dff36312d9b3e880b52772a31b07fb63 Mon Sep 17 00:00:00 2001 From: andy Date: Mon, 18 Aug 2025 16:46:50 +0200 Subject: [PATCH 06/12] black and isort formating --- lmms_eval/tasks/lemonade/utils.py | 53 ++++++++++++++----------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index 7466c1a1f..c3b2f2c83 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -1,16 +1,17 @@ import os -from datasets import load_dataset +import zipfile +from collections import defaultdict from datetime import datetime + import cv2 -from collections import defaultdict -from PIL import Image +import lmms_eval.tasks._task_utils.file_utils as file_utils import numpy as np import requests -import zipfile -from tqdm import tqdm -import lmms_eval.tasks._task_utils.file_utils as file_utils -from lmms_eval.utils import load_yaml_config +from datasets import load_dataset from huggingface_hub import hf_hub_download +from lmms_eval.utils import load_yaml_config +from PIL import Image +from tqdm import tqdm MAX_NUM_FRAMES = 8 @@ -24,6 +25,7 @@ data_dir = "./data/lemonade" + def download_and_extract_lemonade_videos(data_dir): os.makedirs(data_dir, exist_ok=True) videos_dir = os.path.join(data_dir, "videos") @@ -31,18 +33,14 @@ def download_and_extract_lemonade_videos(data_dir): for zip_name in LEMONADE_ZIP_NAMES: print(f"Downloading {zip_name} from Hugging Face...") - zip_path = hf_hub_download( - repo_id="amathislab/LEMONADE", - filename=zip_name, - repo_type="dataset", - cache_dir=os.path.join(data_dir, "cache") - ) - - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_path = hf_hub_download(repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", cache_dir=os.path.join(data_dir, "cache")) + + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(videos_dir) - + print("All videos downloaded and extracted successfully.\n") + def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES): cap = cv2.VideoCapture(video_file) @@ -89,10 +87,7 @@ def lemonade_doc_to_visual(doc): video_filename = doc["Clip"] + "_hololens.mp4" - video_path = os.path.join( - videos_dir, - video_filename - ) + video_path = os.path.join(videos_dir, video_filename) if os.path.exists(video_path): start = int(doc["Start"]) @@ -102,15 +97,15 @@ def lemonade_doc_to_visual(doc): raise FileNotFoundError(f"Video file not found: {video_path}") return frames - + def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None): if lmms_eval_specific_kwargs is None: lmms_eval_specific_kwargs = {} - + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + question = "Question: " + doc["Question"] parsed_options = parse_options(eval(doc["Answers"])) choices = "Choices:\n" + parsed_options @@ -148,7 +143,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): for char in [",", ".", "!", "?", ";", ":", "'"]: response = response.strip(char) - response = " " + response + " " + response = " " + response + " " index_ans = True ans_with_brack = False @@ -160,7 +155,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): if f"{choice}." in response: candidates.append(choice) ans_with_period = True - for choice in all_choices: + for choice in all_choices: if f"{choice}:" in response: candidates.append(choice) ans_with_colon = True @@ -170,14 +165,14 @@ def parse_multi_choice_response(response, all_choices, index2ans): candidates.append(choice) ans_with_brack = True if len(candidates) == 0: - for choice in all_choices: + for choice in all_choices: if f"{choice} " in response: candidates.append(choice) if len(candidates) == 0 and len(response.split()) > 5: for index, ans in index2ans.items(): if ans.lower() in response.lower(): candidates.append(index) - index_ans = False + index_ans = False if len(candidates) == 0: pred_index = "A" @@ -278,5 +273,5 @@ def compute_accuracy(grouped_results): print(f" {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)") print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)") - - return overall_acc \ No newline at end of file + + return overall_acc From 0391cd4e24a183008c9678dbdb074793eec1a798 Mon Sep 17 00:00:00 2001 From: andy Date: Mon, 18 Aug 2025 17:01:32 +0200 Subject: [PATCH 07/12] clean imports --- lmms_eval/tasks/lemonade/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index c3b2f2c83..bb51677b2 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -1,17 +1,11 @@ import os import zipfile from collections import defaultdict -from datetime import datetime import cv2 -import lmms_eval.tasks._task_utils.file_utils as file_utils import numpy as np -import requests -from datasets import load_dataset from huggingface_hub import hf_hub_download -from lmms_eval.utils import load_yaml_config from PIL import Image -from tqdm import tqdm MAX_NUM_FRAMES = 8 From 060935d4b388087019bfc1be33c7b85e26aaf05b Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Mon, 8 Sep 2025 08:55:34 +0000 Subject: [PATCH 08/12] implement coderabbitai comments --- lmms_eval/tasks/lemonade/utils.py | 217 +++++++++++++++++++++--------- 1 file changed, 153 insertions(+), 64 deletions(-) diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index bb51677b2..30ba75241 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -1,14 +1,15 @@ +import ast import os -import zipfile -from collections import defaultdict - import cv2 import numpy as np -from huggingface_hub import hf_hub_download +import yaml +import zipfile +from collections import defaultdict from PIL import Image +from typing import Any, Optional +from huggingface_hub import hf_hub_download MAX_NUM_FRAMES = 8 - LEMONADE_ZIP_NAMES = [ "videos_batch_0.zip", "videos_batch_1.zip", @@ -16,53 +17,82 @@ "videos_batch_3.zip", "videos_batch_4.zip", ] +DEFAULT_DATA_DIR = "./data/lemonade" -data_dir = "./data/lemonade" +def download_and_extract_lemonade_videos(data_dir: str) -> None: + """ + Download and extract LEMONADE files from Hugging Face into a local data directory. + Args: + data_dir: Directory that stores the files. + Returns: + None + """ -def download_and_extract_lemonade_videos(data_dir): os.makedirs(data_dir, exist_ok=True) videos_dir = os.path.join(data_dir, "videos") os.makedirs(videos_dir, exist_ok=True) for zip_name in LEMONADE_ZIP_NAMES: print(f"Downloading {zip_name} from Hugging Face...") - zip_path = hf_hub_download(repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", cache_dir=os.path.join(data_dir, "cache")) - - with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_path = hf_hub_download( + repo_id="amathislab/LEMONADE", + filename=zip_name, + repo_type="dataset", + cache_dir=os.path.join(data_dir, "cache") + ) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(videos_dir) - + print("All videos downloaded and extracted successfully.\n") - -def load_video(video_file, start_frame, end_frame, max_num_frames=MAX_NUM_FRAMES): +def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = MAX_NUM_FRAMES) -> list[Image.Image]: + """ + Args: + video_file: Path to the video file. + start_frame: Starting frame index. + end_frame: Ending frame index. + max_num_frames: Number of frames to sample from the video segment. + Returns: + List of PIL Image objects representing sampled frames + """ cap = cv2.VideoCapture(video_file) - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - - start_frame = max(0, start_frame) - end_frame = min(end_frame, total_frames - 1) - total_valid_frames = end_frame - start_frame + 1 - num_frames = min(max_num_frames, total_valid_frames) - - step = total_valid_frames / num_frames - frame_indices = [int(start_frame + i * step) for i in range(num_frames)] - - frames = [] - for target_idx in frame_indices: - cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx) - success, frame = cap.read() - if not success: - continue - frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - pil_img = Image.fromarray(frame_rgb).convert("RGB") - frames.append(pil_img) - - cap.release() - return frames - + try: + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + start_frame = max(0, start_frame) + end_frame = min(end_frame, total_frames - 1) + total_valid_frames = end_frame - start_frame + 1 + num_frames = min(max_num_frames, total_valid_frames) + step = total_valid_frames / num_frames + frame_indices = [int(start_frame + i * step) for i in range(num_frames)] + frames = [] + for target_idx in frame_indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx) + success, frame = cap.read() + if not success: + continue + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_img = Image.fromarray(frame_rgb).convert("RGB") + frames.append(pil_img) + + return frames + finally: + cap.release() + +def parse_options(options: list[str]) -> str: + """ + Format a list of multiple-choice options into a string. + The function assigns letters to each option and returns them in a newline-separated string. + + Args: + options (list[str]): A list of option strings. + + Returns: + str: A formatted string with each option on a new line, prefixed by its corresponding letter. + """ -def parse_options(options): option_letters = [chr(ord("A") + i) for i in range(len(options))] if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)): @@ -72,15 +102,22 @@ def parse_options(options): return choices_str -def lemonade_doc_to_visual(doc): - videos_dir = os.path.join(data_dir, "videos") +def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]: + """ + Load video frames for a given entry in the LEMONADE dataset. + + Args: + doc: A dictionary representing an entry in the dataset. + Returns: + frames: List of PIL Image objects representing sampled frames + """ + videos_dir = os.path.join(DEFAULT_DATA_DIR, "videos") if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0: print("Videos directory is empty — downloading and extracting...\n") - download_and_extract_lemonade_videos(data_dir) + download_and_extract_lemonade_videos(DEFAULT_DATA_DIR) video_filename = doc["Clip"] + "_hololens.mp4" - video_path = os.path.join(videos_dir, video_filename) if os.path.exists(video_path): @@ -88,32 +125,51 @@ def lemonade_doc_to_visual(doc): end = int(doc["End"]) frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES) else: - raise FileNotFoundError(f"Video file not found: {video_path}") - + raise FileNotFoundError( + f"Video file not found: {video_path}. " + f"Expected video for clip '{doc['Clip']}' at {video_path}" + ) return frames + +def lemonade_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: Optional[dict[str, Any]] = None) -> str: + """ + Convert a LEMONADE dataset entry into a formatted text prompt. + Args: + doc: A dictionary representing an entry in the dataset. + lmms_eval_specific_kwargs: Optional dictionary for additional prompt formatting. + Returns: + str: A formatted prompt string ready for model input + """ -def lemonade_doc_to_text(doc, lmms_eval_specific_kwargs=None): if lmms_eval_specific_kwargs is None: lmms_eval_specific_kwargs = {} - + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + question = "Question: " + doc["Question"] - parsed_options = parse_options(eval(doc["Answers"])) + parsed_options = parse_options(ast.literal_eval(doc["Answers"])) choices = "Choices:\n" + parsed_options return f"{pre_prompt}{question}\n{choices}{post_prompt}" -def get_multi_choice_info(options): +def get_multi_choice_info(options: list[str]) -> tuple[dict[str, str], list[str]]: """ - Given the list of options for multiple choice question - Return the index2ans and all_choices + Map a list of options to letter labels (A, B, C, ...). + + Args: + options: The set of answer options + Returns: + tuple[dict[str, str], list[str]]: + - index2ans: Mapping from letters to option text. + - all_choices: List of the assigned letters. """ - assert isinstance(options, list), f"Expected list of options, got {type(options)}: {options}" - + + if not isinstance(options, list): + raise TypeError(f"Expected list of options, got {type(options)}: {options}") + start_chr = "A" all_choices = [] index2ans = {} @@ -124,11 +180,18 @@ def get_multi_choice_info(options): return index2ans, all_choices -def parse_multi_choice_response(response, all_choices, index2ans): +def parse_multi_choice_response(response: str, all_choices: list[str], index2ans: dict[str, str]) -> str: """ - Parse the prediction from the generated response. - Return the predicted index e.g., A, B, C, D. + Parse a model response and return the predicted choice label (e.g., "A", "B", "C", "D"). + + Args: + response (str): The generated response to parse. + all_choices (list[str]): The set of valid choice labels. + index2ans (dict[str, str]): Mapping from choice labels to their full answer text. + Returns: + str: The predicted choice label. """ + if response == "API Error": return "API Error" @@ -137,7 +200,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): for char in [",", ".", "!", "?", ";", ":", "'"]: response = response.strip(char) - response = " " + response + " " + response = " " + response + " " index_ans = True ans_with_brack = False @@ -149,7 +212,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): if f"{choice}." in response: candidates.append(choice) ans_with_period = True - for choice in all_choices: + for choice in all_choices: if f"{choice}:" in response: candidates.append(choice) ans_with_colon = True @@ -159,14 +222,14 @@ def parse_multi_choice_response(response, all_choices, index2ans): candidates.append(choice) ans_with_brack = True if len(candidates) == 0: - for choice in all_choices: + for choice in all_choices: if f"{choice} " in response: candidates.append(choice) if len(candidates) == 0 and len(response.split()) > 5: for index, ans in index2ans.items(): if ans.lower() in response.lower(): candidates.append(index) - index_ans = False + index_ans = False if len(candidates) == 0: pred_index = "A" @@ -200,17 +263,43 @@ def parse_multi_choice_response(response, all_choices, index2ans): return pred_index -def lemonade_process_results(doc, results): +def lemonade_process_results(doc: dict[str, Any], results: list[Any]) -> dict[str, dict]: + """ + Process the results from the model and compute accuracy. + + Args: + doc: A dictionary representing an entry in the dataset. + results: List of model outputs. + Returns: + A dictionary containing accuracy information. + """ + pred = results[0] - - index2ans, all_choices = get_multi_choice_info(eval(doc["Answers"])) + index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["Answers"])) parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) - acc = {"QID": doc["QID"], "category": doc["Category"], "subcategory": doc["Subcategory"], "difficulty": doc["Difficulty"], "answer": doc["Correct Answer"], "parsed_pred": parsed_pred, "original_pred": pred} + acc = { + "QID": doc["QID"], + "category": doc["Category"], + "subcategory": doc["Subcategory"], + "difficulty": doc["Difficulty"], + "answer": doc["Correct Answer"], + "parsed_pred": parsed_pred, + "original_pred": pred + } return {"acc": acc} -def lemonade_aggregate_results(results): +def lemonade_aggregate_results(results: list[dict[str, Any]]) -> float: + """ + Aggregate the results from the evaluation. + + Args: + results: List of dicts containing individual evaluation results. + Returns: + overall_acc: Overall accuracy. + + """ def compute_accuracy(grouped_results): acc_dict = {} for key, samples in grouped_results.items(): From 50d70dae880005507e4d13d9f77446fb276b6cb0 Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Wed, 10 Sep 2025 08:30:31 +0000 Subject: [PATCH 09/12] download data in cache --- lmms_eval/tasks/lemonade/utils.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index 30ba75241..d4ad7a243 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -2,7 +2,6 @@ import os import cv2 import numpy as np -import yaml import zipfile from collections import defaultdict from PIL import Image @@ -10,14 +9,11 @@ from huggingface_hub import hf_hub_download MAX_NUM_FRAMES = 8 -LEMONADE_ZIP_NAMES = [ - "videos_batch_0.zip", - "videos_batch_1.zip", - "videos_batch_2.zip", - "videos_batch_3.zip", - "videos_batch_4.zip", -] -DEFAULT_DATA_DIR = "./data/lemonade" +LEMONADE_ZIP_NAMES = [f"videos_batch_{i}.zip" for i in range(5)] + +HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface/") +base_cache_dir = os.path.expanduser(HF_HOME) +videos_dir = os.path.join(base_cache_dir, "videos") def download_and_extract_lemonade_videos(data_dir: str) -> None: """ @@ -30,8 +26,8 @@ def download_and_extract_lemonade_videos(data_dir: str) -> None: """ os.makedirs(data_dir, exist_ok=True) - videos_dir = os.path.join(data_dir, "videos") os.makedirs(videos_dir, exist_ok=True) + print(f"Creating videos directory at {videos_dir}...") for zip_name in LEMONADE_ZIP_NAMES: print(f"Downloading {zip_name} from Hugging Face...") @@ -39,12 +35,13 @@ def download_and_extract_lemonade_videos(data_dir: str) -> None: repo_id="amathislab/LEMONADE", filename=zip_name, repo_type="dataset", - cache_dir=os.path.join(data_dir, "cache") + local_dir=os.path.join(base_cache_dir, "lemonade_zips"), + local_dir_use_symlinks=False, + resume_download=True, ) + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(videos_dir) - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(videos_dir) - print("All videos downloaded and extracted successfully.\n") def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = MAX_NUM_FRAMES) -> list[Image.Image]: @@ -112,10 +109,9 @@ def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]: frames: List of PIL Image objects representing sampled frames """ - videos_dir = os.path.join(DEFAULT_DATA_DIR, "videos") if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0: print("Videos directory is empty — downloading and extracting...\n") - download_and_extract_lemonade_videos(DEFAULT_DATA_DIR) + download_and_extract_lemonade_videos(base_cache_dir) video_filename = doc["Clip"] + "_hololens.mp4" video_path = os.path.join(videos_dir, video_filename) From 615004126e92239f0921544258b84e67d7fc9385 Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Sat, 27 Sep 2025 11:25:50 +0000 Subject: [PATCH 10/12] add README for lemonade --- lmms_eval/tasks/lemonade/README.md | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 lmms_eval/tasks/lemonade/README.md diff --git a/lmms_eval/tasks/lemonade/README.md b/lmms_eval/tasks/lemonade/README.md new file mode 100644 index 000000000..518379188 --- /dev/null +++ b/lmms_eval/tasks/lemonade/README.md @@ -0,0 +1,45 @@ +# LEMONADE + +## Task Description + +**LEMONADE** (Language models Evaluation of MOtion aNd Action-Driven Enquiries) is a QA benchmark extracted from the **EPFL-Smart-Kitchen-30** dataset (see [arXiv](https://arxiv.org/abs/2506.01608)). It consists of **36,521 closed-ended QA pairs** linked to egocentric video clips. + +Questions are organized into three groups and six subcategories: + +- **Behavior Understanding** + - *Perception*: recognizing perceived actions + - *Reasoning*: reasoning over unseen behaviors +- **Long-term Understanding** + - *Summarization*: summarizing over longer clips + - *Session Properties*: inferring session-level information +- **Motion & Biomechanics** + - *Physical Attributes*: inferring hand shapes, joint angles, etc. + - *Kinematics*: inferring trajectory velocities + +The benchmark was evaluated using **`lmms-eval`** in the associated publication. + + +## Implementation + +- **utils.py**: Handles data loading from Hugging Face, video loading, answer parsing, and metric evaluation. +- **lemonade.yaml**: Contains the default prompts and evaluation settings. + +When running LEMONADE through `lmms-eval`, the data is automatically downloaded. For direct dataset access, please refer to [Hugging Face](https://huggingface.co/datasets/amathislab/LEMONADE) or [Zenodo](https://zenodo.org/records/15535461). + +Performance is evaluated in terms of accuracy against the ground truth, with results reported overall as well as per category and subcategory. + +## Citation + +If you use **LEMONADE**, please cite: + +```bibtex +@misc{bonnetto2025epflsmartkitchen, + title={EPFL-Smart-Kitchen-30: Densely annotated cooking dataset with 3D kinematics to challenge video and language models}, + author={Andy Bonnetto and Haozhe Qi and Franklin Leong and Matea Tashkovska and Mahdi Rad and Solaiman Shokur and Friedhelm Hummel and Silvestro Micera and Marc Pollefeys and Alexander Mathis}, + year={2025}, + eprint={2506.01608}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2506.01608}, +} +``` \ No newline at end of file From e31fc8ee7c80d7c8c21c4e3a8ee1808d7115c1b4 Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Sat, 27 Sep 2025 13:49:24 +0000 Subject: [PATCH 11/12] remove custom download def, move max_num_frames to config --- lmms_eval/tasks/lemonade/lemonade.yaml | 7 +++- lmms_eval/tasks/lemonade/utils.py | 51 ++++++++------------------ 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml index 22b81d02d..e4263faf5 100644 --- a/lmms_eval/tasks/lemonade/lemonade.yaml +++ b/lmms_eval/tasks/lemonade/lemonade.yaml @@ -1,4 +1,8 @@ dataset_path: amathislab/LEMONADE +dataset_kwargs: + video: true + cache_dir: lemonade_data + force_unzip: true task: "lemonade" test_split: test output_type: generate_until @@ -20,4 +24,5 @@ metric_list: lmms_eval_specific_kwargs: default: pre_prompt: "Answer the following multiple-choice question using the given images.\n" - post_prompt: "\nRespond only with the letter of the correct answer." \ No newline at end of file + post_prompt: "\nRespond only with the letter of the correct answer." + max_num_frames: 8 \ No newline at end of file diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py index d4ad7a243..ad43fc05d 100644 --- a/lmms_eval/tasks/lemonade/utils.py +++ b/lmms_eval/tasks/lemonade/utils.py @@ -3,48 +3,31 @@ import cv2 import numpy as np import zipfile +import yaml from collections import defaultdict +from pathlib import Path from PIL import Image from typing import Any, Optional from huggingface_hub import hf_hub_download -MAX_NUM_FRAMES = 8 -LEMONADE_ZIP_NAMES = [f"videos_batch_{i}.zip" for i in range(5)] +with open(Path(__file__).parent / "lemonade.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for line in raw_data: + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface/") base_cache_dir = os.path.expanduser(HF_HOME) -videos_dir = os.path.join(base_cache_dir, "videos") +cache_dir = config["dataset_kwargs"]["cache_dir"] +videos_dir = os.path.join(base_cache_dir, cache_dir) -def download_and_extract_lemonade_videos(data_dir: str) -> None: - """ - Download and extract LEMONADE files from Hugging Face into a local data directory. +max_num_frames = config.get("lmms_eval_specific_kwargs", {}).get("max_num_frames", 8) - Args: - data_dir: Directory that stores the files. - Returns: - None - """ - os.makedirs(data_dir, exist_ok=True) - os.makedirs(videos_dir, exist_ok=True) - print(f"Creating videos directory at {videos_dir}...") - - for zip_name in LEMONADE_ZIP_NAMES: - print(f"Downloading {zip_name} from Hugging Face...") - zip_path = hf_hub_download( - repo_id="amathislab/LEMONADE", - filename=zip_name, - repo_type="dataset", - local_dir=os.path.join(base_cache_dir, "lemonade_zips"), - local_dir_use_symlinks=False, - resume_download=True, - ) - with zipfile.ZipFile(zip_path, "r") as zf: - zf.extractall(videos_dir) - - print("All videos downloaded and extracted successfully.\n") - -def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = MAX_NUM_FRAMES) -> list[Image.Image]: +def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = max_num_frames) -> list[Image.Image]: """ Args: video_file: Path to the video file. @@ -109,17 +92,13 @@ def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]: frames: List of PIL Image objects representing sampled frames """ - if not os.path.exists(videos_dir) or len(os.listdir(videos_dir)) == 0: - print("Videos directory is empty — downloading and extracting...\n") - download_and_extract_lemonade_videos(base_cache_dir) - video_filename = doc["Clip"] + "_hololens.mp4" video_path = os.path.join(videos_dir, video_filename) if os.path.exists(video_path): start = int(doc["Start"]) end = int(doc["End"]) - frames = load_video(video_path, start, end, max_num_frames=MAX_NUM_FRAMES) + frames = load_video(video_path, start, end, max_num_frames=max_num_frames) else: raise FileNotFoundError( f"Video file not found: {video_path}. " From e891bad9c6b5da481b5aed8e0b4315f0e069e869 Mon Sep 17 00:00:00 2001 From: Matea Tashkovska Date: Sat, 27 Sep 2025 13:52:42 +0000 Subject: [PATCH 12/12] add lemonade to current_tasks --- docs/current_tasks.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/current_tasks.md b/docs/current_tasks.md index 98fe0e319..5d48c4302 100644 --- a/docs/current_tasks.md +++ b/docs/current_tasks.md @@ -245,6 +245,7 @@ python -m lmms_eval --tasks list_with_num - egoschema_mcppl - egoschema_subset_mcppl - egoschema_subset +- [LEMONADE](https://huggingface.co/datasets/amathislab/LEMONADE) (lemonade) - [LongVideoBench](https://github.com/longvideobench/LongVideoBench) - [MovieChat](https://github.com/rese1f/MovieChat) (moviechat) - Global Mode for entire video (moviechat_global)