12 changes: 9 additions & 3 deletions README.md
@@ -1,24 +1,29 @@
# SimpleQA Evaluator

-A tool for evaluating and comparing different Question Answering (QA) policies on OpenAI's SimpleQA benchmark, measuring metrics like f-score, accuracy, and latency.
+A tool for evaluating and comparing different Question Answering (QA) policies on OpenAI's SimpleQA
+benchmark, measuring metrics like f-score, accuracy, and latency.

## Features

-The evaluator provides comprehensive testing capabilities for AI search engines including Linkup Deep, Linkup Standard, and Tavily APIs. It supports both single-policy evaluation and head-to-head comparisons, with built-in async processing and progress tracking.
+The evaluator provides comprehensive testing capabilities for AI search engines including Linkup Deep,
+Linkup Standard, Tavily, and Perplexity APIs.
+It supports both single-policy evaluation and head-to-head comparisons, with built-in async processing
+and progress tracking.

## Setup

1. Install dependencies:

```bash
-pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai
+pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai requests
```

2. Create a `.env` file with your API keys:
```
LINKUP_API_KEY=your_linkup_key
TAVILY_API_KEY=your_tavily_key
OPENAI_API_KEY=your_openai_key
+PERPLEXITY_API_KEY=your_perplexity_key
```

3. Ensure you have the `simple_qa_test_set.csv` file containing the SimpleQA benchmark in your project directory.
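
For example, a run might look like the following. This is a hypothetical invocation: only the `--policy1` and `--policy2` flags are visible in `eval.py`'s argument parser below, and the remaining flag names may differ.

```bash
# Single-policy evaluation (flags beyond --policy1/--policy2 are assumptions)
python eval.py --policy1 perplexity

# Head-to-head comparison of two policies
python eval.py --policy1 linkup --policy2 perplexity
```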
@@ -47,6 +52,7 @@ The evaluator currently supports three QA policies:
- Linkup API (deep search mode)
- Linkup API (standard search mode)
- Tavily API (advanced search mode)
+- Perplexity API (you can specify the desired model)

## Output and Metrics

Empty file added clients/__init__.py
31 changes: 31 additions & 0 deletions clients/perplexity_.py
@@ -0,0 +1,31 @@
from typing import Optional
import os
import requests


class PerplexityClient:
    """Minimal client for the Perplexity chat-completions endpoint."""

def __init__(self, api_key: Optional[str] = None,
base_url: str = "https://api.perplexity.ai/chat/completions"):
if api_key is None:
api_key = os.getenv("PERPLEXITY_API_KEY")
if not api_key:
raise ValueError("The Perplexity API key was not provided")
self.api_key = api_key
self.base_url = base_url

def search(self, query: str, model: str = "sonar-pro", max_tokens: int = 300) -> str:
payload = {
"model": model,
"messages": [
{"role": "user", "content": query}
],
"max_tokens": max_tokens
}
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
        response = requests.post(self.base_url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()  # surface HTTP errors before indexing into the body
        result = response.json()
        return result["choices"][0]["message"]["content"]
87 changes: 54 additions & 33 deletions eval.py
@@ -7,6 +7,7 @@
import threading
import time
import traceback
+
from asyncio import Semaphore
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
@@ -15,16 +16,15 @@

import pandas as pd
from dotenv import load_dotenv
+
+from clients.perplexity_ import PerplexityClient
from grader import grade_sample
from linkup import LinkupClient
from tavily import TavilyClient
from tqdm import tqdm

load_dotenv()
-
-linkup_api_key = os.getenv("LINKUP_API_KEY")
-tavily_api_key = os.getenv("TAVILY_API_KEY")


def get_data():
df = pd.read_csv("simple_qa_test_set.csv")
@@ -53,63 +53,82 @@ def print_log(policy_type: str, message: str):


async def run_linkup_policy(
-    question: str,
-    policy_args: dict[str, Any] | None,
+    question: str,
+    policy_args: dict[str, Any] | None,
) -> Tuple[str, None]:
"""Run linkup policy in a thread to avoid blocking."""
"""Run linkup policy in a thread to avoid blocking.
Provide policy_args for local run as follows:
policy_args = {"api_key": 'LOCAL_API_KEY', "base_url": 'LOCAL_BASE_URL'}"""
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
result = await loop.run_in_executor(
pool,
-            lambda: LinkupClient(api_key=linkup_api_key, **policy_args or dict())
+            lambda: LinkupClient(**policy_args or dict())
.search(question, depth="deep", output_type="sourcedAnswer")
.answer,
)
return result, None


async def run_linkup_standard_policy(
-    question: str,
-    policy_args: dict[str, Any] | None,
+    question: str,
+    policy_args: dict[str, Any] | None,
) -> Tuple[str, None]:
"""Run linkup policy in a thread to avoid blocking."""
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
result = await loop.run_in_executor(
pool,
-            lambda: LinkupClient(api_key=linkup_api_key, **policy_args or dict())
+            lambda: LinkupClient(**policy_args or dict())
.search(question, depth="standard", output_type="sourcedAnswer")
.answer,
)
return result, None


async def run_tavily_policy(
-    question: str,
-    policy_args: dict[str, Any] | None,
+    question: str,
+    policy_args: dict[str, Any] | None,
) -> Tuple[str, None]:
"""Run tavily policy in a thread to avoid blocking."""
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
result = await loop.run_in_executor(
pool,
-            lambda: TavilyClient(api_key=tavily_api_key, **policy_args or dict()).search(
+            lambda: TavilyClient(**policy_args or dict()).search(
question, search_depth="advanced", include_answer=True
)["answer"],
)
return result, None


+async def run_perplexity_policy(
+    question: str,
+    policy_args: dict[str, Any] | None,
+) -> Tuple[str, None]:
+    """Run Perplexity (Sonar Pro by default) policy in a thread to avoid blocking."""
+    loop = asyncio.get_event_loop()
+    with ThreadPoolExecutor() as pool:
+        result = await loop.run_in_executor(
+            pool,
+            lambda: PerplexityClient(**policy_args or dict()).search(
+                question
+            ),
+        )
+    return result, None


async def run_policy_async(
-    question: str,
-    policy_type: str = "linkup",
-    policy_args: dict[str, Any] | None = None,
+    question: str,
+    policy_type: str = "linkup",
+    policy_args: dict[str, Any] | None = None,
) -> Tuple[str, Optional[Any]]:
"""Async version of run_policy."""
policy_handlers = {
"tavily": run_tavily_policy,
"linkup": run_linkup_policy,
"linkup_standard": run_linkup_standard_policy,
"perplexity": run_perplexity_policy,
}
if policy_type not in policy_handlers:
raise ValueError(f"Unknown policy type: {policy_type}")
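
A sketch of calling the dispatcher above (the question is illustrative; the rest of the function body sits outside this hunk):

```python
import asyncio

# Illustrative one-off call; inside eval.py this is awaited from async code.
answer, _ = asyncio.run(
    run_policy_async("When was the Eiffel Tower completed?", policy_type="perplexity")
)
```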
@@ -138,10 +157,10 @@ def calculate_f_score(metrics: Dict[str, float]) -> float:
"""
if (metrics["accuracy_given_attempted"] + metrics["is_correct"]) > 0:
return (
-            2
-            * metrics["accuracy_given_attempted"]
-            * metrics["is_correct"]
-            / (metrics["accuracy_given_attempted"] + metrics["is_correct"])
+            2
+            * metrics["accuracy_given_attempted"]
+            * metrics["is_correct"]
+            / (metrics["accuracy_given_attempted"] + metrics["is_correct"])
)
return 0.0
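
For illustration, with `accuracy_given_attempted = 0.8` and `is_correct = 0.6` (made-up values), the harmonic mean works out as follows:

```python
# Worked example with illustrative numbers.
metrics = {"accuracy_given_attempted": 0.8, "is_correct": 0.6}
print(calculate_f_score(metrics))  # 2 * 0.8 * 0.6 / (0.8 + 0.6) ≈ 0.6857
```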

@@ -252,14 +271,14 @@ async def compare_policies(policy1: str, policy2: str, num_samples: int):
def generate_question_id(question: str) -> str:
"""Generate a unique, deterministic ID for a question."""
return hashlib.sha256(question.encode()).hexdigest()[
-        :16
-    ]  # First 16 chars of hash is sufficient
+        :16
+    ]  # First 16 chars of hash is sufficient
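
For instance (illustrative input), the ID is stable across runs and 16 hex characters long:

```python
qid = generate_question_id("When was the Eiffel Tower completed?")  # illustrative question
assert qid == generate_question_id("When was the Eiffel Tower completed?")  # deterministic
assert len(qid) == 16  # first 64 bits of the SHA-256 digest
```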


async def evaluate_questions_async(
-    questions_df: pd.DataFrame,
-    policy_type: str,
-    policy_args: dict[str, Any] | None,
+    questions_df: pd.DataFrame,
+    policy_type: str,
+    policy_args: dict[str, Any] | None,
) -> list:
"""Evaluate questions and return results."""
sem = Semaphore(MAX_CONCURRENT_TASKS)
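
The remainder of the function falls outside this hunk; a minimal sketch of the semaphore-bounded fan-out the line above sets up (assumed shape only; the column name and per-row call are illustrative):

```python
# Assumed shape, not the file's exact body: bound concurrency with the semaphore,
# then fan out one task per question.
async def bounded_eval(row):
    async with sem:  # at most MAX_CONCURRENT_TASKS requests in flight
        return await run_policy_async(row["problem"], policy_type, policy_args)

results = await asyncio.gather(
    *(bounded_eval(row) for _, row in questions_df.iterrows())
)
```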
@@ -406,12 +425,12 @@ def analyze_results(results_file: Path):


async def compare_policies_async(
-    policy1: str,
-    policy1_args: dict[str, Any] | None,
-    policy2: str,
-    policy2_args: dict[str, Any] | None,
-    num_samples: int,
-    seed: int,
+    policy1: str,
+    policy1_args: dict[str, Any] | None,
+    policy2: str,
+    policy2_args: dict[str, Any] | None,
+    num_samples: int,
+    seed: int,
) -> None:
"""Compare two policies on the same set of questions."""
questions_df = sample_questions(n=num_samples, seed=seed)
@@ -492,7 +511,7 @@ def save_summary(self, metrics: dict):
)
parser.add_argument(
"--policy1",
-        choices=["linkup", "linkup_standard", "tavily"],
+        choices=["linkup", "linkup_standard", "tavily", "perplexity"],
help="First (or only) policy to evaluate",
)
parser.add_argument(
@@ -503,7 +522,7 @@ def save_summary(self, metrics: dict):
)
parser.add_argument(
"--policy2",
-        choices=["linkup", "linkup_standard", "tavily"],
+        choices=["linkup", "linkup_standard", "tavily", "perplexity"],
help="Second policy to compare against (only in compare mode)",
)
parser.add_argument(
Expand All @@ -527,6 +546,7 @@ def save_summary(self, metrics: dict):

args = parser.parse_args()

+
async def main():
try:
if args.mode == "evaluate":
@@ -559,6 +579,7 @@ async def main():
traceback.print_exc()
        sys.exit(1)

+
if args.analyze:
# Analyze existing results
analyze_results(Path(args.analyze))