19 changes: 19 additions & 0 deletions scripts/DEEPSEEK_R1_ON_GAUDI.md
@@ -89,3 +89,22 @@ ray start --address='${head_ip}:6379' --resources='{"HPU": 8, "TPU": 0}'
python scripts/run_example_tp_2nodes.py --model ${YOUR_PATH}/DeepSeek-R1-static
```

# Requantize the Official FP8 Model Using INC
- INC: https://github.com/yiliu30/vllm-fork/tree/r1-woq

- Calibration (run these commands from the directory containing the INC config JSONs, e.g. `scripts/`, so the `QUANT_CONFIG` paths resolve)
```bash
export OFFICIAL_FP8_MODEL=deepseek-ai/DeepSeek-R1
# For quick test
VLLM_REQUANT_FP8_INC=1 QUANT_CONFIG=inc_measure_with_fp8kv_config.json VLLM_ENABLE_RUNTIME_DEQUANT=1 python run_example_tp.py --model ${OFFICIAL_FP8_MODEL} --tokenizer ${OFFICIAL_FP8_MODEL} --osl 32 --max_num_seqs 1
# For calibration with pile dataset
VLLM_REQUANT_FP8_INC=1 QUANT_CONFIG=inc_measure_with_fp8kv_config.json VLLM_ENABLE_RUNTIME_DEQUANT=1 python run_example_tp.py --model ${OFFICIAL_FP8_MODEL} --tokenizer ${OFFICIAL_FP8_MODEL} --osl 32 --max_num_seqs 1 --nprompts 512 --dataset pile
```
- Quantization (a calibration-output sanity check is sketched at the end of this section)
```bash
VLLM_REQUANT_FP8_INC=1 QUANT_CONFIG=inc_quant_with_fp8kv_config.json VLLM_ENABLE_RUNTIME_DEQUANT=1 python run_example_tp.py --model ${OFFICIAL_FP8_MODEL} --tokenizer ${OFFICIAL_FP8_MODEL} --max_num_seqs 1 --fp8_kv_cache
```

- Evaluation
```bash
VLLM_REQUANT_FP8_INC=1 QUANT_CONFIG=inc_quant_with_fp8kv_config.json VLLM_ENABLE_RUNTIME_DEQUANT=1 python run_lm_eval.py --model ${OFFICIAL_FP8_MODEL} --tokenizer ${OFFICIAL_FP8_MODEL} --fp8_kv_cache -l 64 --batch_size 1
```
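
Not part of this PR, but a quick sanity check between the calibration and quantization steps: confirm that the calibration run actually wrote its measurement stats to the `dump_stats_path` set in `inc_measure_with_fp8kv_config.json`. A minimal sketch, assuming it is run from the same directory as the calibration command and that INC names its output files with the configured prefix:

```python
import glob
import os

# Sketch: look for measurement stats before launching the quantization run.
# Only the directory/prefix comes from dump_stats_path in the measure config;
# the exact file naming is an assumption.
dump_prefix = "./nc_workspace_measure_kvache/inc_measure_output"
found = glob.glob(dump_prefix + "*")
if not found:
    raise SystemExit(f"No measurement output under {os.path.dirname(dump_prefix)}; run calibration first.")
print(f"Found {len(found)} measurement file(s), e.g. {found[0]}")
```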
15 changes: 15 additions & 0 deletions scripts/inc_measure_with_fp8kv_config.json
@@ -0,0 +1,15 @@
{
"method": "HOOKS",
"mode": "MEASURE",
"observer": "maxabs",
"whitelist": {
"types": [],
"names": []
},
"blocklist": {
"types": [],
"names": ["lm_head", "mlp\\.gate\\b"]
},
"quantize_weight": false,
"dump_stats_path": "./nc_workspace_measure_kvache/inc_measure_output"
}
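
The `names` entries in the blocklist read like regular expressions (note the escaped `\\.` and `\\b`), keeping `lm_head` and the MoE router gates (`mlp.gate`) out of measurement. A small sketch of how such patterns would filter module names, assuming regex matching over module paths and the repository root as working directory; the candidate names below are illustrative, not taken from the model:

```python
import json
import re

# Sketch: apply the blocklist patterns from the measure config to a few
# illustrative module names (regex matching semantics are assumed).
with open("scripts/inc_measure_with_fp8kv_config.json") as f:
    cfg = json.load(f)

patterns = [re.compile(p) for p in cfg["blocklist"]["names"]]
candidates = [
    "lm_head",                       # excluded
    "model.layers.3.mlp.gate",       # router gate -> excluded by mlp\.gate\b
    "model.layers.3.mlp.gate_proj",  # gate projection -> not excluded
]
for name in candidates:
    blocked = any(p.search(name) for p in patterns)
    print(f"{name}: {'blocked' if blocked else 'measured'}")
```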
14 changes: 14 additions & 0 deletions scripts/inc_quant_with_fp8kv_config.json
@@ -0,0 +1,14 @@
{
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"allowlist": {
"types": [],
"names": []
},
"blocklist": {
"types": [],
"names": ["lm_head", "mlp\\.gate\\b"]
},
"dump_stats_path": "./nc_workspace_measure_kvache/inc_measure_output"
}
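
For reference, a short sketch (not part of the PR) comparing the two INC configs. Both point `dump_stats_path` at the same location, which appears to be how the quantization pass picks up the statistics produced during calibration; the differing keys are the mode-specific ones. Paths assume the repository root as working directory:

```python
import json

# Sketch: print which top-level keys differ between the measure and quant configs.
with open("scripts/inc_measure_with_fp8kv_config.json") as f:
    measure = json.load(f)
with open("scripts/inc_quant_with_fp8kv_config.json") as f:
    quant = json.load(f)

for key in sorted(set(measure) | set(quant)):
    m, q = measure.get(key), quant.get(key)
    status = "shared" if m == q else "differs"
    print(f"{key}: {status} (measure={m!r}, quant={q!r})")
```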
27 changes: 26 additions & 1 deletion scripts/run_example_tp.py
@@ -25,6 +25,7 @@
parser.add_argument("--isl", type=int, default=1024, help="input sequence length.")
parser.add_argument("--osl", type=int, default=1024, help="output sequence length.")
parser.add_argument("--nprompts", type=int, default=4, help="The number of prompts.")
parser.add_argument("--max_num_seqs", type=int, default=None, help="The max number of sequences.")
parser.add_argument("--random", action="store_true", help="Randomly sample prompts.")
parser.add_argument("--fp8_kv_cache", action="store_true", help="Use fp8 for kv cache.")
args = parser.parse_args()
@@ -36,8 +37,12 @@
os.environ["VLLM_EP_SIZE"] = f"{args.ep_size}"
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
os.environ["PT_HPU_WEIGHT_SHARING"] = "0"
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
#os.environ['VLLM_DMOE_DYNAMIC_SCALE']='1' # only works for 1.20 + dmoe patch




def sample_sonnet_requests(
dataset_path: str,
num_requests: int,
@@ -160,6 +165,16 @@ def sample_gsm8k_requests(
tokenizer=tokenizer,
do_random=args.random,
)
elif args.dataset == "pile":
from utils import get_prompts, get_prompt_token_ids, get_pile_prompts
least_tokens = args.isl
num_samples = args.nprompts
prompts = get_pile_prompts(args.model, num_samples)
prompt_token_ids = get_prompt_token_ids(
args.model, prompts, least_tokens
)
print(f"Got {len(prompts)} prompts, length of first prompt: {len(prompt_token_ids[0])}.")
gt = None
else:
prompts = [
"Hello, my name is",
@@ -178,6 +193,8 @@ def sample_gsm8k_requests(
param = {}
if args.fp8_kv_cache:
param["kv_cache_dtype"] = "fp8_inc"
if args.max_num_seqs is not None:
param["max_num_seqs"] = args.max_num_seqs
if args.tp_size == 1:
llm = LLM(
model=model,
@@ -201,10 +218,16 @@ def sample_gsm8k_requests(
**param
)


# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
if args.dataset == "pile":
outputs = llm.generate(
prompts=None, sampling_params=sampling_params, prompt_token_ids=prompt_token_ids
)
else:
outputs = llm.generate(prompts, sampling_params)
end = time.perf_counter()
# Print the outputs.
print(f"e2e took {end - start} seconds")
@@ -218,4 +241,6 @@ def sample_gsm8k_requests(
print(f"Generated text: {generated_text!r}")
print(f"Ground truth: {gt_i!r}")
print("====================================")
if os.getenv("VLLM_FORCE_INC", None) is not None:
llm.llm_engine.model_executor.shutdown()
del llm
32 changes: 28 additions & 4 deletions scripts/run_lm_eval.py
@@ -16,8 +16,9 @@
parser.add_argument("--tokenizer", type=str, default=None, help="The model path.")
parser.add_argument("--tp_size", type=int, default=8, help="Tensor Parallelism size.")
parser.add_argument("--ep_size", type=int, default=8, help="Expert Parallelism size.")
parser.add_argument("-l", "--limit", type=int, default=64, help="test request counts.")
parser.add_argument("-l", "--limit", type=int, default=None, help="test request counts.")
parser.add_argument("--batch_size", type=int, default=1, help="The batch size.")
parser.add_argument("--fp8_kv_cache", action="store_true", help="Use fp8 for kv cache.")
args = parser.parse_args()

os.environ["VLLM_SKIP_WARMUP"] = "true"
@@ -36,6 +37,16 @@
#os.environ['VLLM_DMOE_DYNAMIC_SCALE']='1'
#os.environ['VLLM_ENABLE_RUNTIME_DEQUANT']='1'

if args.task == "gsm8k":
#For testing gsm8k quickly
os.environ['VLLM_PROMPT_BS_BUCKET_MIN']='1'
os.environ['VLLM_PROMPT_BS_BUCKET_MAX']='1'
os.environ['VLLM_PROMPT_SEQ_BUCKET_MIN']='2048'
os.environ['VLLM_PROMPT_SEQ_BUCKET_STEP']='512'
os.environ['VLLM_PROMPT_SEQ_BUCKET_MAX']='2048'
os.environ['VLLM_DECODE_BS_BUCKET_MIN']='1'
os.environ['VLLM_DECODE_BS_BUCKET_MAX']='1'

if __name__ == "__main__":

from lm_eval.models.vllm_causallms import VLLM
@@ -44,6 +55,9 @@
model = args.model
if args.tokenizer is None:
args.tokenizer = model
param = {}
if args.fp8_kv_cache:
param["kv_cache_dtype"] = "fp8_inc"
if args.tp_size == 1:
llm = VLLM(
pretrained=model,
@@ -65,17 +79,27 @@
dtype="bfloat16",
gpu_memory_utilization=0.8,
batch_size=args.batch_size,
**param,
)


# Run the evaluation; you can adjust num_fewshot and batch_size as needed.
start = time.perf_counter()
if args.task == "gsm8k":
results = simple_evaluate(model=llm, tasks=["gsm8k"], num_fewshot=5, batch_size=8, limit=args.limit)
from lm_eval.utils import make_table

results = simple_evaluate(
model=llm,
tasks=["gsm8k"],
# num_fewshot=5,
# batch_size=8,
limit=args.limit,
)
end = time.perf_counter()
e2e = end - start
print(make_table(results))
# save as json
with open(f"gsm8k_ep{args.ep_size}_result_samples_limit{args.limit}.jsonl", "w") as f:
with open(f"gsm8k_ep{args.ep_size}_result_samples_limit{str(args.limit)}.jsonl", "w") as f:
json.dump(results['results'], f)
json.dump({"e2e time(secs)": e2e}, f)
f.write("\n")
@@ -86,7 +110,7 @@
results = simple_evaluate(model=llm, tasks=["hellaswag"], num_fewshot=0, batch_size=8, limit=args.limit)
end = time.perf_counter()
e2e = end - start
with open(f"hallaswag_ep{args.ep_size}_result_samples_limit{args.limit}.jsonl", "w") as f:
with open(f"hallaswag_ep{args.ep_size}_result_samples_limit{str(args.limit)}.jsonl", "w") as f:
json.dump(results['results'], f)
json.dump({"e2e time(secs)": e2e}, f)
f.write("\n")
148 changes: 148 additions & 0 deletions scripts/utils.py
@@ -0,0 +1,148 @@
from typing import Any, Dict, List
import json
import random

from transformers import PreTrainedTokenizerBase
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast


def reset_seed(seed=42):
import torch
import random
import numpy as np
print("Using seed: ", seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) # multi-GPU.
# TODO: for future use
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True

def get_prompts():
filename = "pile.txt"
with open(filename, "r") as f:
prompts = f.readlines()
print(f"Number of prompts: {len(prompts)}")
return prompts


def get_prompt_token_ids(model_path, prompts, max_length=1024):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_path)
prompt_token_ids = []
for prompt in prompts:
tokens = tokenizer(
prompt,
return_tensors="pt",
truncation=True,
max_length=max_length,
)
if len(tokens.input_ids[0]) < max_length:
continue
prompt_token_ids.append([x.item() for x in tokens.input_ids[0]])
return prompt_token_ids


def get_pile_prompts(model_name, num_samples=512):
    """Collect calibration prompts from NeelNanda/pile-10k.

    autoround calibration static model:
    NeelNanda/pile-10k, seed=42, iters=1 rtn, nsamples=512 seqlen=1024
    """
    from datasets import load_dataset
    from tqdm import tqdm
    import transformers

# ==-------------------------------------------------------------------------==
# Calibration parameters
least_tokens = 1024
seed = 42
# ==-------------------------------------------------------------------------==

reset_seed(seed)

dataset = load_dataset("NeelNanda/pile-10k", split="train")
dataset = dataset.shuffle(seed=seed)

tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name, trust_remote_code=True
)
num_sample = 0
samples_lst = []
for data in tqdm(dataset):
prompt = data["text"]
tokens = tokenizer(prompt, return_tensors="pt")
if len(tokens.input_ids[0]) < least_tokens:
continue
num_sample += 1
samples_lst.append(prompt)
if num_sample >= num_samples:
break
return samples_lst

#==-------------------------------------------------------------------------==
# Load custom dataset
#==-------------------------------------------------------------------------==

def get_dataset(filepath: str) -> List[List[Dict[str, str]]]:
"""
[
[
{"role": "system", "content": "system prompt"},
{"role": "user", "content": "query prompt"},
],
[
{"role": "system", "content": "1. 角色设定:- 你是...."},
{"role": "user", "content": "搜索关键词】\n梁斌是谁,做什么"},
],
...
]

"""
with open(filepath) as f:
dataset: List[List[Dict[str, str]]] = [json.loads(line) for line in f]
return dataset


def sample_tc_requests(
filepath: str,
tokenizer: PreTrainedTokenizerBase,
num_requests: int = None,
do_random: bool = False,
) -> List[str]:
dataset = get_dataset(filepath)
prompts = dataset
few_shots = 0
sampled_requests: List[str] = []
if num_requests is None:
num_requests = len(prompts)
for j in range(num_requests):
i = (
random.choice(range(len(prompts[few_shots:])))
if do_random
else j + few_shots
)
        # message demo:
        # [
        #     {"role": "system", "content": "1. Role setup: - You are ...."},
        #     {"role": "user", "content": "Search keywords]\nWho is Liang Bin and what does he do"},
        # ],
message: List[Dict[str, str]] = prompts[i]
prompt_with_template = tokenizer.apply_chat_template(
message, add_generation_prompt=True, tokenize=False
)
sampled_requests.append(prompt_with_template)

return sampled_requests

def get_tokenizer(model_path) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
return tokenizer
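
A hedged usage sketch for the helpers above, mirroring how `run_example_tp.py` consumes them; the checkpoint path and the chat-dataset path are placeholders, not files added by this PR:

```python
# Usage sketch (all paths are placeholders). get_pile_prompts pulls calibration
# prompts from NeelNanda/pile-10k; get_prompt_token_ids keeps only prompts that
# reach the requested token count and truncates them to exactly that length.
from utils import get_pile_prompts, get_prompt_token_ids, get_tokenizer, sample_tc_requests

model_path = "/path/to/DeepSeek-R1"  # placeholder checkpoint/tokenizer path
prompts = get_pile_prompts(model_path, num_samples=8)
prompt_token_ids = get_prompt_token_ids(model_path, prompts, max_length=1024)
print(f"{len(prompt_token_ids)} calibration prompts of 1024 tokens each")

# Chat-style sampling: the JSONL format is documented in get_dataset's docstring.
tokenizer = get_tokenizer(model_path)
requests = sample_tc_requests("chat_dataset.jsonl", tokenizer, num_requests=2)  # placeholder dataset
```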

