72 changes: 72 additions & 0 deletions applications/wikipedia/benchmarks.json
@@ -511,3 +511,75 @@
"characters_per_sec": 23222100,
"extrapolated_duration": "0:14:02.324293"
}
{
"downscale": 0.01,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 1.0392728034499998,
"characters_per_sec": 14384533,
"extrapolated_duration": "0:22:39.831352"
}
{
"downscale": 0.01,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.7878980491166666,
"characters_per_sec": 18973843,
"extrapolated_duration": "0:17:10.921303"
}
{
"downscale": 0.01,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.8039170896166667,
"characters_per_sec": 18595766,
"extrapolated_duration": "0:17:31.881324"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.6121226231833333,
"characters_per_sec": 3817226,
"extrapolated_duration": "1:25:24.281077"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.584016127,
"characters_per_sec": 4000934,
"extrapolated_duration": "1:21:28.993159"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.5205063897,
"characters_per_sec": 4489110,
"extrapolated_duration": "1:12:37.331176"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.5561301178666667,
"characters_per_sec": 4201553,
"extrapolated_duration": "1:17:35.549735"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.5417734046666667,
"characters_per_sec": 4312892,
"extrapolated_duration": "1:15:35.364891"
}
{
"downscale": 1,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 14.754031291433334,
"characters_per_sec": 21734383,
"extrapolated_duration": "0:14:59.981332"
}
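A note on the data format: each benchmark run appends one pretty-printed JSON object to benchmarks.json (see the entrypoint in main.py below), so the file is a stream of concatenated objects rather than a single valid JSON document. A minimal reading sketch, assuming that format; the read_benchmarks helper name is illustrative, not part of the PR:

# Sketch: parse the concatenated json.dumps(..., indent=2) objects appended to benchmarks.json.
import json

def read_benchmarks(path="benchmarks.json"):
    decoder = json.JSONDecoder()
    text = open(path).read()
    entries, pos = [], 0
    while pos < len(text):
        # Skip whitespace between objects, then decode the next whole object.
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        obj, end = decoder.raw_decode(text, pos)
        entries.append(obj)
        pos = end
    return entries

# Example: compare throughput across the configurations recorded above.
for e in read_benchmarks():
    print(e["downscale"], e["batch_size"], e["characters_per_sec"], e["extrapolated_duration"])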
37 changes: 24 additions & 13 deletions applications/wikipedia/main.py
@@ -24,7 +24,7 @@
data_dir = f"{cache_dir}/{dataset_name}"
DATA_PATH = Path(data_dir)

-PUSH_TO_HUB = False
+SAVE_TO_DISK = True
dataset_name = f"567-labs/wikipedia-embedding-{MODEL_SLUG}-sample"
dataset_file = "wiki-embeddings.parquet"

@@ -145,8 +145,11 @@ async def embed(self, chunks):


@stub.function(
image=Image.debian_slim().pip_install("datasets", "pyarrow", "tqdm"),
image=Image.debian_slim().pip_install(
"datasets", "pyarrow", "tqdm", "hf_transfer", "huggingface_hub"
),
volumes={cache_dir: volume},
_allow_background_volume_commits=True,
timeout=84600,
secret=Secret.from_name("huggingface-credentials"),
)
@@ -186,7 +189,13 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
    start = time.perf_counter()
    acc_chunks = []
    embeddings = []
-    for batch_chunks, batch_embeddings in model.embed.map(batches, order_outputs=False):
+    for resp in model.embed.map(batches, order_outputs=False, return_exceptions=True):
+        if isinstance(resp, Exception):
+            print(f"Exception: {resp}")
+            continue
+
+        batch_chunks, batch_embeddings = resp
+
        acc_chunks.extend(batch_chunks)
        embeddings.extend(batch_embeddings)

@@ -207,8 +216,10 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
"extrapolated_duration": extrapolated_duration_cps_fmt,
}

if PUSH_TO_HUB:
print(f"Pushing to hub {dataset_name}")
print(json.dumps(resp, indent=2))

if SAVE_TO_DISK:
print(f"Creating parquet table...")
table = pa.Table.from_arrays(
[
pa.array([chunk[0] for chunk in acc_chunks]), # id
@@ -219,17 +230,17 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
            ],
            names=["id", "url", "title", "text", "embedding"],
        )
-        pq.write_table(table, dataset_file)
-        dataset = load_dataset("parquet", data_files=dataset_file)
-        dataset.push_to_hub(dataset_name, token=os.environ["HUGGINGFACE_TOKEN"])
+        print(f"Saving to disk at {cache_dir}/{dataset_file}")
+        pq.write_table(table, f"{cache_dir}/{dataset_file}")
+        volume.commit()

    return resp


@stub.local_entrypoint()
def main():
-    for scale, batch_size in product([0.25], [512 * 50]):
-        with open("benchmarks.json", "a") as f:
-            benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
-            print(json.dumps(benchmark, indent=2))
-            f.write(json.dumps(benchmark, indent=2) + "\n")
+    scale = 0.01
+    batch_size = 512 * 150
+    with open("benchmarks.json", "a") as f:
+        benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
+        f.write(json.dumps(benchmark, indent=2) + "\n")
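The reworked entrypoint above benchmarks a single configuration (downscale 0.01, batch size 512 * 150) and appends the result to benchmarks.json; the earlier itertools.product sweep was removed. If a multi-configuration sweep is still wanted, a sketch along the old lines, reusing stub and embed_dataset from main.py; the specific scales and batch sizes below are illustrative, not taken from this PR:

# Sketch: sweep several (scale, batch_size) pairs and append each benchmark,
# mirroring the removed product(...) loop. Values here are placeholders.
from itertools import product
import json

@stub.local_entrypoint()
def sweep():
    with open("benchmarks.json", "a") as f:
        for scale, batch_size in product([0.001, 0.01], [512 * 50, 512 * 150]):
            benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
            f.write(json.dumps(benchmark, indent=2) + "\n")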
47 changes: 47 additions & 0 deletions applications/wikipedia/upload.py
@@ -0,0 +1,47 @@
from pathlib import Path

from modal import Image, Stub, Volume, Secret

MODEL_ID = "BAAI/bge-small-en-v1.5"
MODEL_SLUG = MODEL_ID.split("/")[-1]

BATCH_SIZE = 512
DOCKER_IMAGE = (
    "ghcr.io/huggingface/text-embeddings-inference:86-0.4.0"  # Ampere 86 for A10s.
    # "ghcr.io/huggingface/text-embeddings-inference:0.4.0"  # Ampere 80 for A100s.
    # "ghcr.io/huggingface/text-embeddings-inference:0.3.0"  # Turing for T4s.
)
dataset_name = "wikipedia"
volume = Volume.persisted("embedding-wikipedia")
cache_dir = "/data"
data_dir = f"{cache_dir}/{dataset_name}"
DATA_PATH = Path(data_dir)

dataset_name = f"567-labs/wikipedia-embedding-{MODEL_SLUG}-sample"
dataset_file = "wiki-embeddings.parquet"


stub = Stub("embeddings")


@stub.function(
    image=Image.debian_slim().pip_install(
        "datasets", "pyarrow", "tqdm", "hf_transfer", "huggingface_hub"
    ),
    volumes={cache_dir: volume},
    _allow_background_volume_commits=True,
    timeout=84600,
    secret=Secret.from_name("huggingface-credentials"),
)
def upload_dataset():
    from datasets import load_dataset
    import os

    print(f"Pushing to hub {dataset_name}")
    dataset = load_dataset("parquet", data_files=f"{cache_dir}/{dataset_file}")
    dataset.push_to_hub(dataset_name, token=os.environ["HUGGINGFACE_TOKEN"])


@stub.local_entrypoint()
def main():
    upload_dataset.remote()
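upload.py reads the parquet file that main.py now writes to the shared volume and pushes it to the Hub, decoupling the upload from the embedding run. Once upload_dataset() has run, the dataset should be loadable directly from the Hub; a minimal usage sketch, assuming the push succeeded and the repo visibility or token allows access:

# Sketch: load the pushed embeddings dataset from the Hugging Face Hub.
# Repo name follows dataset_name in upload.py for MODEL_SLUG "bge-small-en-v1.5".
from datasets import load_dataset

ds = load_dataset("567-labs/wikipedia-embedding-bge-small-en-v1.5-sample")
print(ds)
print(ds["train"][0].keys())  # expected columns: id, url, title, text, embedding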