12 changes: 9 additions & 3 deletions README.md
@@ -1,24 +1,29 @@
# SimpleQA Evaluator

-A tool for evaluating and comparing different Question Answering (QA) policies on OpenAI's SimpleQA benchmark, measuring metrics like f-score, accuracy, and latency.
+A tool for evaluating and comparing different Question Answering (QA) policies on OpenAI's SimpleQA
+benchmark, measuring metrics like f-score, accuracy, and latency.

## Features

-The evaluator provides comprehensive testing capabilities for AI search engines including Linkup Deep, Linkup Standard, and Tavily APIs. It supports both single-policy evaluation and head-to-head comparisons, with built-in async processing and progress tracking.
+The evaluator provides comprehensive testing capabilities for AI search engines including Linkup Deep,
+Linkup Standard, Tavily, and Perplexity APIs.
+It supports both single-policy evaluation and head-to-head comparisons, with built-in async processing
+and progress tracking.

## Setup

1. Install dependencies:

```bash
-pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai
+pip install pandas tqdm python-dotenv linkup-sdk tavily-python openai requests
```

2. Create a `.env` file with your API keys:
```
LINKUP_API_KEY=your_linkup_key
TAVILY_API_KEY=your_tavily_key
OPENAI_API_KEY=your_openai_key
+PERPLEXITY_API_KEY=your_perplexity_key
```

3. Ensure you have the `simple_qa_test_set.csv` file containing the SimpleQA benchmark in your project directory.
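
For example, a run might look like the following. This is a hypothetical invocation: only the `--policy1` and `--policy2` flags are visible in `eval.py`'s argument parser below, and the remaining flag names may differ.

```bash
# Single-policy evaluation (flags beyond --policy1/--policy2 are assumptions)
python eval.py --policy1 perplexity

# Head-to-head comparison of two policies
python eval.py --policy1 linkup --policy2 perplexity
```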
@@ -47,6 +52,7 @@ The evaluator currently supports three QA policies:
- Linkup API (deep search mode)
- Linkup API (standard search mode)
- Tavily API (advanced search mode)
+- Perplexity API (you can specify the desired model)

## Output and Metrics

Empty file added clients/__init__.py
31 changes: 31 additions & 0 deletions clients/perplexity_.py
@@ -0,0 +1,31 @@
from typing import Optional
import os
import requests


class PerplexityClient:
    """Minimal client for the Perplexity chat-completions endpoint."""

def __init__(self, api_key: Optional[str] = None,
base_url: str = "https://api.perplexity.ai/chat/completions"):
if api_key is None:
api_key = os.getenv("PERPLEXITY_API_KEY")
if not api_key:
raise ValueError("The Perplexity API key was not provided")
self.api_key = api_key
self.base_url = base_url

def search(self, query: str, model: str = "sonar-pro", max_tokens: int = 300) -> str:
payload = {
"model": model,
"messages": [
{"role": "user", "content": query}
],
"max_tokens": max_tokens
}
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
        response = requests.post(self.base_url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()  # surface HTTP errors before indexing into the body
        result = response.json()
        return result["choices"][0]["message"]["content"]
87 changes: 54 additions & 33 deletions eval.py
@@ -7,6 +7,7 @@
import threading
import time
import traceback
+
from asyncio import Semaphore
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
@@ -15,16 +16,15 @@

import pandas as pd
from dotenv import load_dotenv
+
+from clients.perplexity_ import PerplexityClient
from grader import grade_sample
from linkup import LinkupClient
from tavily import TavilyClient
from tqdm import tqdm

load_dotenv()
-
-linkup_api_key = os.getenv("LINKUP_API_KEY")
-tavily_api_key = os.getenv("TAVILY_API_KEY")


def get_data():
df = pd.read_csv("simple_qa_test_set.csv")
@@ -53,63 +53,82 @@ def print_log(policy_type: str, message: str):


async def run_linkup_policy(
-    question: str,
-    policy_args: dict[str, Any] | None,
+    question: str,
+    policy_args: dict[str, Any] | None,
) -> Tuple[str, None]:
"""Run linkup policy in a thread to avoid blocking."""
"""Run linkup policy in a thread to avoid blocking.
Provide policy_args for local run as follows:
policy_args = {"api_key": 'LOCAL_API_KEY', "base_url": 'LOCAL_BASE_URL'}"""
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
result = await loop.run_in_executor(
pool,
-            lambda: LinkupClient(api_key=linkup_api_key, **policy_args or dict())
+            lambda: LinkupClient(**policy_args or dict())
.search(question, depth="deep", output_type="sourcedAnswer")
.answer,
)
return result, None


async def run_linkup_standard_policy(
-    question: str,
-    policy_args: dict[str, Any] | None,
+    question: str,
+    policy_args: dict[str, Any] | None,
) -> Tuple[str, None]:
"""Run linkup policy in a thread to avoid blocking."""
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
result = await loop.run_in_executor(
pool,
-            lambda: LinkupClient(api_key=linkup_api_key, **policy_args or dict())
+            lambda: LinkupClient(**policy_args or dict())
.search(question, depth="standard", output_type="sourcedAnswer")
.answer,
)
return result, None


async def run_tavily_policy(
-    question: str,
-    policy_args: dict[str, Any] | None,
+    question: str,
+    policy_args: dict[str, Any] | None,
) -> Tuple[str, None]:
"""Run tavily policy in a thread to avoid blocking."""
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
result = await loop.run_in_executor(
pool,
-            lambda: TavilyClient(api_key=tavily_api_key, **policy_args or dict()).search(
+            lambda: TavilyClient(**policy_args or dict()).search(
question, search_depth="advanced", include_answer=True
)["answer"],
)
return result, None


+async def run_perplexity_policy(
+    question: str,
+    policy_args: dict[str, Any] | None,
+) -> Tuple[str, None]:
+    """Run Perplexity (Sonar Pro by default) policy in a thread to avoid blocking."""
+    loop = asyncio.get_event_loop()
+    with ThreadPoolExecutor() as pool:
+        result = await loop.run_in_executor(
+            pool,
+            lambda: PerplexityClient(**policy_args or dict()).search(
+                question
+            ),
+        )
+    return result, None


async def run_policy_async(
-    question: str,
-    policy_type: str = "linkup",
-    policy_args: dict[str, Any] | None = None,
+    question: str,
+    policy_type: str = "linkup",
+    policy_args: dict[str, Any] | None = None,
) -> Tuple[str, Optional[Any]]:
"""Async version of run_policy."""
policy_handlers = {
"tavily": run_tavily_policy,
"linkup": run_linkup_policy,
"linkup_standard": run_linkup_standard_policy,
"perplexity": run_perplexity_policy,
}
if policy_type not in policy_handlers:
raise ValueError(f"Unknown policy type: {policy_type}")
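
A sketch of calling the dispatcher above (the question is illustrative; the rest of the function body sits outside this hunk):

```python
import asyncio

# Illustrative one-off call; inside eval.py this is awaited from async code.
answer, _ = asyncio.run(
    run_policy_async("When was the Eiffel Tower completed?", policy_type="perplexity")
)
```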
@@ -138,10 +157,10 @@ def calculate_f_score(metrics: Dict[str, float]) -> float:
"""
if (metrics["accuracy_given_attempted"] + metrics["is_correct"]) > 0:
return (
-            2
-            * metrics["accuracy_given_attempted"]
-            * metrics["is_correct"]
-            / (metrics["accuracy_given_attempted"] + metrics["is_correct"])
+            2
+            * metrics["accuracy_given_attempted"]
+            * metrics["is_correct"]
+            / (metrics["accuracy_given_attempted"] + metrics["is_correct"])
)
return 0.0
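
For illustration, with `accuracy_given_attempted = 0.8` and `is_correct = 0.6` (made-up values), the harmonic mean works out as follows:

```python
# Worked example with illustrative numbers.
metrics = {"accuracy_given_attempted": 0.8, "is_correct": 0.6}
print(calculate_f_score(metrics))  # 2 * 0.8 * 0.6 / (0.8 + 0.6) ≈ 0.6857
```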

@@ -252,14 +271,14 @@ async def compare_policies(policy1: str, policy2: str, num_samples: int):
def generate_question_id(question: str) -> str:
"""Generate a unique, deterministic ID for a question."""
return hashlib.sha256(question.encode()).hexdigest()[
-        :16
-    ]  # First 16 chars of hash is sufficient
+        :16
+    ]  # First 16 chars of hash is sufficient
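
For instance (illustrative input), the ID is stable across runs and 16 hex characters long:

```python
qid = generate_question_id("When was the Eiffel Tower completed?")  # illustrative question
assert qid == generate_question_id("When was the Eiffel Tower completed?")  # deterministic
assert len(qid) == 16  # first 64 bits of the SHA-256 digest
```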


async def evaluate_questions_async(
-    questions_df: pd.DataFrame,
-    policy_type: str,
-    policy_args: dict[str, Any] | None,
+    questions_df: pd.DataFrame,
+    policy_type: str,
+    policy_args: dict[str, Any] | None,
) -> list:
"""Evaluate questions and return results."""
sem = Semaphore(MAX_CONCURRENT_TASKS)
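
The remainder of the function falls outside this hunk; a minimal sketch of the semaphore-bounded fan-out the line above sets up (assumed shape only; the column name and per-row call are illustrative):

```python
# Assumed shape, not the file's exact body: bound concurrency with the semaphore,
# then fan out one task per question.
async def bounded_eval(row):
    async with sem:  # at most MAX_CONCURRENT_TASKS requests in flight
        return await run_policy_async(row["problem"], policy_type, policy_args)

results = await asyncio.gather(
    *(bounded_eval(row) for _, row in questions_df.iterrows())
)
```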
@@ -406,12 +425,12 @@ def analyze_results(results_file: Path):


async def compare_policies_async(
-    policy1: str,
-    policy1_args: dict[str, Any] | None,
-    policy2: str,
-    policy2_args: dict[str, Any] | None,
-    num_samples: int,
-    seed: int,
+    policy1: str,
+    policy1_args: dict[str, Any] | None,
+    policy2: str,
+    policy2_args: dict[str, Any] | None,
+    num_samples: int,
+    seed: int,
) -> None:
"""Compare two policies on the same set of questions."""
questions_df = sample_questions(n=num_samples, seed=seed)
@@ -492,7 +511,7 @@ def save_summary(self, metrics: dict):
)
parser.add_argument(
"--policy1",
-        choices=["linkup", "linkup_standard", "tavily"],
+        choices=["linkup", "linkup_standard", "tavily", "perplexity"],
help="First (or only) policy to evaluate",
)
parser.add_argument(
@@ -503,7 +522,7 @@ def save_summary(self, metrics: dict):
)
parser.add_argument(
"--policy2",
-        choices=["linkup", "linkup_standard", "tavily"],
+        choices=["linkup", "linkup_standard", "tavily", "perplexity"],
help="Second policy to compare against (only in compare mode)",
)
parser.add_argument(
Expand All @@ -527,6 +546,7 @@ def save_summary(self, metrics: dict):

args = parser.parse_args()

+
async def main():
try:
if args.mode == "evaluate":
@@ -559,6 +579,7 @@ async def main():
traceback.print_exc()
        sys.exit(1)

+
if args.analyze:
# Analyze existing results
analyze_results(Path(args.analyze))