diff --git a/README.md b/README.md
index f068681..8b09354 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,21 @@
 # SimpleQA Evaluator

-A tool for evaluating and comparing different Question Answering (QA) policies on OpenAI's SimpleQA benchmark, measuring metrics like f-score, accuracy, and latency.
+A tool for evaluating and comparing different Question Answering (QA) policies on OpenAI's SimpleQA
+benchmark, measuring metrics like f-score, accuracy, and latency.

 ## Features

-The evaluator provides comprehensive testing capabilities for AI search engines including Linkup Deep, Linkup Standard, and Tavily APIs. It supports both single-policy evaluation and head-to-head comparisons, with built-in async processing and progress tracking.
+The evaluator provides comprehensive testing capabilities for AI search engines including Linkup Deep,
+Linkup Standard, Tavily, and Perplexity APIs.
+It supports both single-policy evaluation and head-to-head comparisons, with built-in async processing
+and progress tracking.

 ## Setup

 1. Install dependencies:

 ```bash
-pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai
+pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai requests
 ```

 2. Create a `.env` file with your API keys:
@@ -19,6 +23,7 @@ pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai
 LINKUP_API_KEY=your_linkup_key
 TAVILY_API_KEY=your_tavily_key
 OPENAI_API_KEY=your_openai_key
+PERPLEXITY_API_KEY=your_perplexity_key
 ```

 3. Ensure you have the `simple_qa_test_set.csv` file containing the SimpleQA benchmark in your project directory.
@@ -47,6 +52,7 @@ The evaluator currently supports three QA policies:
 - Linkup API (deep search mode)
 - Linkup API (standard search mode)
 - Tavily API (advanced search mode)
+- Perplexity API (with a configurable model)

 ## Output and Metrics

diff --git a/clients/__init__.py b/clients/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/clients/perplexity_.py b/clients/perplexity_.py
new file mode 100644
index 0000000..c778a79
--- /dev/null
+++ b/clients/perplexity_.py
@@ -0,0 +1,32 @@
+from typing import Optional
+import os
+import requests
+
+
+class PerplexityClient:
+
+    def __init__(self, api_key: Optional[str] = None,
+                 base_url: str = "https://api.perplexity.ai/chat/completions"):
+        if api_key is None:
+            api_key = os.getenv("PERPLEXITY_API_KEY")
+        if not api_key:
+            raise ValueError("The Perplexity API key was not provided")
+        self.api_key = api_key
+        self.base_url = base_url
+
+    def search(self, query: str, model: str = "sonar-pro", max_tokens: int = 300) -> str:
+        payload = {
+            "model": model,
+            "messages": [
+                {"role": "user", "content": query}
+            ],
+            "max_tokens": max_tokens
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+        response = requests.post(self.base_url, json=payload, headers=headers, timeout=60)
+        response.raise_for_status()  # surface HTTP errors instead of failing on a missing "choices" key
+        result = response.json()
+        return result["choices"][0]["message"]["content"]
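
For reference, a minimal usage sketch of the new client (illustrative only, not part of the patch; it assumes `PERPLEXITY_API_KEY` is exported, which is the fallback the constructor above uses when no key is passed):

```python
from clients.perplexity_ import PerplexityClient

# With no explicit key, the constructor falls back to the PERPLEXITY_API_KEY env var.
client = PerplexityClient()

# `model` and `max_tokens` mirror the defaults of PerplexityClient.search().
answer = client.search("Who won the 2018 FIFA World Cup?", model="sonar-pro", max_tokens=300)
print(answer)
```
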
= os.getenv("TAVILY_API_KEY") - def get_data(): df = pd.read_csv("simple_qa_test_set.csv") @@ -53,15 +53,17 @@ def print_log(policy_type: str, message: str): async def run_linkup_policy( - question: str, - policy_args: dict[str, Any] | None, + question: str, + policy_args: dict[str, Any] | None, ) -> Tuple[str, None]: - """Run linkup policy in a thread to avoid blocking.""" + """Run linkup policy in a thread to avoid blocking. + Provide policy_args for local run as follows: + policy_args = {"api_key": 'LOCAL_API_KEY', "base_url": 'LOCAL_BASE_URL'}""" loop = asyncio.get_event_loop() with ThreadPoolExecutor() as pool: result = await loop.run_in_executor( pool, - lambda: LinkupClient(api_key=linkup_api_key, **policy_args or dict()) + lambda: LinkupClient(**policy_args or dict()) .search(question, depth="deep", output_type="sourcedAnswer") .answer, ) @@ -69,15 +71,15 @@ async def run_linkup_policy( async def run_linkup_standard_policy( - question: str, - policy_args: dict[str, Any] | None, + question: str, + policy_args: dict[str, Any] | None, ) -> Tuple[str, None]: """Run linkup policy in a thread to avoid blocking.""" loop = asyncio.get_event_loop() with ThreadPoolExecutor() as pool: result = await loop.run_in_executor( pool, - lambda: LinkupClient(api_key=linkup_api_key, **policy_args or dict()) + lambda: LinkupClient(**policy_args or dict()) .search(question, depth="standard", output_type="sourcedAnswer") .answer, ) @@ -85,31 +87,48 @@ async def run_linkup_standard_policy( async def run_tavily_policy( - question: str, - policy_args: dict[str, Any] | None, + question: str, + policy_args: dict[str, Any] | None, ) -> Tuple[str, None]: """Run tavily policy in a thread to avoid blocking.""" loop = asyncio.get_event_loop() with ThreadPoolExecutor() as pool: result = await loop.run_in_executor( pool, - lambda: TavilyClient(api_key=tavily_api_key, **policy_args or dict()).search( + lambda: TavilyClient(**policy_args or dict()).search( question, search_depth="advanced", include_answer=True )["answer"], ) return result, None +async def run_perplexity_policy( + question: str, + policy_args: dict[str, Any] | None, +) -> Tuple[str, None]: + """Run perplexity sonar pro policy in a thread to avoid blocking.""" + loop = asyncio.get_event_loop() + with ThreadPoolExecutor() as pool: + result = await loop.run_in_executor( + pool, + lambda: PerplexityClient(**policy_args or dict()).search( + question + ), + ) + return result, None + + async def run_policy_async( - question: str, - policy_type: str = "linkup", - policy_args: dict[str, Any] | None = None, + question: str, + policy_type: str = "linkup", + policy_args: dict[str, Any] | None = None, ) -> Tuple[str, Optional[Any]]: """Async version of run_policy.""" policy_handlers = { "tavily": run_tavily_policy, "linkup": run_linkup_policy, "linkup_standard": run_linkup_standard_policy, + "perplexity": run_perplexity_policy, } if policy_type not in policy_handlers: raise ValueError(f"Unknown policy type: {policy_type}") @@ -138,10 +157,10 @@ def calculate_f_score(metrics: Dict[str, float]) -> float: """ if (metrics["accuracy_given_attempted"] + metrics["is_correct"]) > 0: return ( - 2 - * metrics["accuracy_given_attempted"] - * metrics["is_correct"] - / (metrics["accuracy_given_attempted"] + metrics["is_correct"]) + 2 + * metrics["accuracy_given_attempted"] + * metrics["is_correct"] + / (metrics["accuracy_given_attempted"] + metrics["is_correct"]) ) return 0.0 @@ -252,14 +271,14 @@ async def compare_policies(policy1: str, policy2: str, num_samples: int): def 
@@ -138,10 +157,10 @@ def calculate_f_score(metrics: Dict[str, float]) -> float:
     """
     if (metrics["accuracy_given_attempted"] + metrics["is_correct"]) > 0:
         return (
-            2
-            * metrics["accuracy_given_attempted"]
-            * metrics["is_correct"]
-            / (metrics["accuracy_given_attempted"] + metrics["is_correct"])
+            2
+            * metrics["accuracy_given_attempted"]
+            * metrics["is_correct"]
+            / (metrics["accuracy_given_attempted"] + metrics["is_correct"])
         )
     return 0.0

@@ -252,14 +271,14 @@ async def compare_policies(policy1: str, policy2: str, num_samples: int):
 def generate_question_id(question: str) -> str:
     """Generate a unique, deterministic ID for a question."""
     return hashlib.sha256(question.encode()).hexdigest()[
-        :16
-    ]  # First 16 chars of hash is sufficient
+        :16
+    ]  # First 16 chars of hash is sufficient


 async def evaluate_questions_async(
-    questions_df: pd.DataFrame,
-    policy_type: str,
-    policy_args: dict[str, Any] | None,
+    questions_df: pd.DataFrame,
+    policy_type: str,
+    policy_args: dict[str, Any] | None,
 ) -> list:
     """Evaluate questions and return results."""
     sem = Semaphore(MAX_CONCURRENT_TASKS)
@@ -406,12 +425,12 @@ def analyze_results(results_file: Path):


 async def compare_policies_async(
-    policy1: str,
-    policy1_args: dict[str, Any] | None,
-    policy2: str,
-    policy2_args: dict[str, Any] | None,
-    num_samples: int,
-    seed: int,
+    policy1: str,
+    policy1_args: dict[str, Any] | None,
+    policy2: str,
+    policy2_args: dict[str, Any] | None,
+    num_samples: int,
+    seed: int,
 ) -> None:
     """Compare two policies on the same set of questions."""
     questions_df = sample_questions(n=num_samples, seed=seed)
@@ -492,7 +511,7 @@ def save_summary(self, metrics: dict):
 )
 parser.add_argument(
     "--policy1",
-    choices=["linkup", "linkup_standard", "tavily"],
+    choices=["linkup", "linkup_standard", "tavily", "perplexity"],
     help="First (or only) policy to evaluate",
 )
 parser.add_argument(
@@ -503,7 +522,7 @@ def save_summary(self, metrics: dict):
 )
 parser.add_argument(
     "--policy2",
-    choices=["linkup", "linkup_standard", "tavily"],
+    choices=["linkup", "linkup_standard", "tavily", "perplexity"],
     help="Second policy to compare against (only in compare mode)",
 )
 parser.add_argument(
@@ -527,6 +546,7 @@ def save_summary(self, metrics: dict):

 args = parser.parse_args()

+
 async def main():
     try:
         if args.mode == "evaluate":
@@ -559,6 +579,7 @@ async def main():
         traceback.print_exc()
         sys.exit(1)

+
     if args.analyze:
         # Analyze existing results
         analyze_results(Path(args.analyze))
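
With these changes the Perplexity policy becomes selectable from the command line. The invocations below are a hedged sketch: only `--mode`, `--policy1`, `--policy2`, and `--analyze` are visible in this diff, the `compare` mode value is inferred from the `--policy2` help text, and the results path is a placeholder.

```bash
# Evaluate the new policy on its own
python eval.py --mode evaluate --policy1 perplexity

# Head-to-head comparison against Linkup deep search
python eval.py --mode compare --policy1 linkup --policy2 perplexity

# Re-analyze a previously saved results file (placeholder path)
python eval.py --analyze path/to/results.json
```
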