72 changes: 72 additions & 0 deletions applications/wikipedia/benchmarks.json
@@ -511,3 +511,75 @@
"characters_per_sec": 23222100,
"extrapolated_duration": "0:14:02.324293"
}
{
"downscale": 0.01,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 1.0392728034499998,
"characters_per_sec": 14384533,
"extrapolated_duration": "0:22:39.831352"
}
{
"downscale": 0.01,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.7878980491166666,
"characters_per_sec": 18973843,
"extrapolated_duration": "0:17:10.921303"
}
{
"downscale": 0.01,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.8039170896166667,
"characters_per_sec": 18595766,
"extrapolated_duration": "0:17:31.881324"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.6121226231833333,
"characters_per_sec": 3817226,
"extrapolated_duration": "1:25:24.281077"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.584016127,
"characters_per_sec": 4000934,
"extrapolated_duration": "1:21:28.993159"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.5205063897,
"characters_per_sec": 4489110,
"extrapolated_duration": "1:12:37.331176"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.5561301178666667,
"characters_per_sec": 4201553,
"extrapolated_duration": "1:17:35.549735"
}
{
"downscale": 0.001,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 0.5417734046666667,
"characters_per_sec": 4312892,
"extrapolated_duration": "1:15:35.364891"
}
{
"downscale": 1,
"batch_size": 76800,
"n_gpu": 100,
"duration_mins": 14.754031291433334,
"characters_per_sec": 21734383,
"extrapolated_duration": "0:14:59.981332"
}
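A note on the data format: each benchmark run appends one pretty-printed JSON object to benchmarks.json (see the entrypoint in main.py below), so the file is a stream of concatenated objects rather than a single valid JSON document. A minimal reading sketch, assuming that format; the read_benchmarks helper name is illustrative, not part of the PR:

# Sketch: parse the concatenated json.dumps(..., indent=2) objects appended to benchmarks.json.
import json

def read_benchmarks(path="benchmarks.json"):
    decoder = json.JSONDecoder()
    text = open(path).read()
    entries, pos = [], 0
    while pos < len(text):
        # Skip whitespace between objects, then decode the next whole object.
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        obj, end = decoder.raw_decode(text, pos)
        entries.append(obj)
        pos = end
    return entries

# Example: compare throughput across the configurations recorded above.
for e in read_benchmarks():
    print(e["downscale"], e["batch_size"], e["characters_per_sec"], e["extrapolated_duration"])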
37 changes: 24 additions & 13 deletions applications/wikipedia/main.py
@@ -24,7 +24,7 @@
data_dir = f"{cache_dir}/{dataset_name}"
DATA_PATH = Path(data_dir)

-PUSH_TO_HUB = False
+SAVE_TO_DISK = True
dataset_name = f"567-labs/wikipedia-embedding-{MODEL_SLUG}-sample"
dataset_file = "wiki-embeddings.parquet"

@@ -145,8 +145,11 @@ async def embed(self, chunks):


@stub.function(
image=Image.debian_slim().pip_install("datasets", "pyarrow", "tqdm"),
image=Image.debian_slim().pip_install(
"datasets", "pyarrow", "tqdm", "hf_transfer", "huggingface_hub"
),
volumes={cache_dir: volume},
_allow_background_volume_commits=True,
timeout=84600,
secret=Secret.from_name("huggingface-credentials"),
)
@@ -186,7 +189,13 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
    start = time.perf_counter()
    acc_chunks = []
    embeddings = []
-    for batch_chunks, batch_embeddings in model.embed.map(batches, order_outputs=False):
+    for resp in model.embed.map(batches, order_outputs=False, return_exceptions=True):
+        if isinstance(resp, Exception):
+            print(f"Exception: {resp}")
+            continue
+
+        batch_chunks, batch_embeddings = resp
+
        acc_chunks.extend(batch_chunks)
        embeddings.extend(batch_embeddings)

@@ -207,8 +216,10 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
"extrapolated_duration": extrapolated_duration_cps_fmt,
}

if PUSH_TO_HUB:
print(f"Pushing to hub {dataset_name}")
print(json.dumps(resp, indent=2))

if SAVE_TO_DISK:
print(f"Creating parquet table...")
table = pa.Table.from_arrays(
[
pa.array([chunk[0] for chunk in acc_chunks]), # id
@@ -219,17 +230,17 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
            ],
            names=["id", "url", "title", "text", "embedding"],
        )
-        pq.write_table(table, dataset_file)
-        dataset = load_dataset("parquet", data_files=dataset_file)
-        dataset.push_to_hub(dataset_name, token=os.environ["HUGGINGFACE_TOKEN"])
+        print(f"Saving to disk at {cache_dir}/{dataset_file}")
+        pq.write_table(table, f"{cache_dir}/{dataset_file}")
+        volume.commit()

    return resp


@stub.local_entrypoint()
def main():
-    for scale, batch_size in product([0.25], [512 * 50]):
-        with open("benchmarks.json", "a") as f:
-            benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
-            print(json.dumps(benchmark, indent=2))
-            f.write(json.dumps(benchmark, indent=2) + "\n")
+    scale = 0.01
+    batch_size = 512 * 150
+    with open("benchmarks.json", "a") as f:
+        benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
+        f.write(json.dumps(benchmark, indent=2) + "\n")
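The reworked entrypoint above benchmarks a single configuration (downscale 0.01, batch size 512 * 150) and appends the result to benchmarks.json; the earlier itertools.product sweep was removed. If a multi-configuration sweep is still wanted, a sketch along the old lines, reusing stub and embed_dataset from main.py; the specific scales and batch sizes below are illustrative, not taken from this PR:

# Sketch: sweep several (scale, batch_size) pairs and append each benchmark,
# mirroring the removed product(...) loop. Values here are placeholders.
from itertools import product
import json

@stub.local_entrypoint()
def sweep():
    with open("benchmarks.json", "a") as f:
        for scale, batch_size in product([0.001, 0.01], [512 * 50, 512 * 150]):
            benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
            f.write(json.dumps(benchmark, indent=2) + "\n")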
47 changes: 47 additions & 0 deletions applications/wikipedia/upload.py
@@ -0,0 +1,47 @@
from pathlib import Path

from modal import Image, Stub, Volume, Secret

MODEL_ID = "BAAI/bge-small-en-v1.5"
MODEL_SLUG = MODEL_ID.split("/")[-1]

BATCH_SIZE = 512
DOCKER_IMAGE = (
    "ghcr.io/huggingface/text-embeddings-inference:86-0.4.0"  # Ampere 86 for A10s.
    # "ghcr.io/huggingface/text-embeddings-inference:0.4.0"  # Ampere 80 for A100s.
    # "ghcr.io/huggingface/text-embeddings-inference:0.3.0"  # Turing for T4s.
)
dataset_name = "wikipedia"
volume = Volume.persisted("embedding-wikipedia")
cache_dir = "/data"
data_dir = f"{cache_dir}/{dataset_name}"
DATA_PATH = Path(data_dir)

dataset_name = f"567-labs/wikipedia-embedding-{MODEL_SLUG}-sample"
dataset_file = "wiki-embeddings.parquet"


stub = Stub("embeddings")


@stub.function(
    image=Image.debian_slim().pip_install(
        "datasets", "pyarrow", "tqdm", "hf_transfer", "huggingface_hub"
    ),
    volumes={cache_dir: volume},
    _allow_background_volume_commits=True,
    timeout=84600,
    secret=Secret.from_name("huggingface-credentials"),
)
def upload_dataset():
    from datasets import load_dataset
    import os

    print(f"Pushing to hub {dataset_name}")
    dataset = load_dataset("parquet", data_files=f"{cache_dir}/{dataset_file}")
    dataset.push_to_hub(dataset_name, token=os.environ["HUGGINGFACE_TOKEN"])


@stub.local_entrypoint()
def main():
    upload_dataset.remote()
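upload.py reads the parquet file that main.py now writes to the shared volume and pushes it to the Hub, decoupling the upload from the embedding run. Once upload_dataset() has run, the dataset should be loadable directly from the Hub; a minimal usage sketch, assuming the push succeeded and the repo visibility or token allows access:

# Sketch: load the pushed embeddings dataset from the Hugging Face Hub.
# Repo name follows dataset_name in upload.py for MODEL_SLUG "bge-small-en-v1.5".
from datasets import load_dataset

ds = load_dataset("567-labs/wikipedia-embedding-bge-small-en-v1.5-sample")
print(ds)
print(ds["train"][0].keys())  # expected columns: id, url, title, text, embedding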