Commit 62c35e2

Merge pull request #1 from sionic-ai/attention_mask

report: tf-torch mismatch

2 parents 30bc004 + 47c799d · commit 62c35e2

File tree: 8 files changed, +1663 −12 lines

.gitignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+model/
+*.zip
+__pycache__/
```

.python-version

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+3.11
```

BGEM3WeightConverter.py

Lines changed: 27 additions & 6 deletions
```diff
@@ -1,4 +1,5 @@
 from BGEM3TFModel import BGEM3TensorFlow, save_model_with_tokenizer
+from huggingface_hub import hf_hub_download
 
 from transformers import AutoModel
 import numpy as np
@@ -19,13 +20,33 @@ def load_sparse_weights():
 
 
 def load_colbert_weights():
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    model_path = os.path.join(script_dir, './bge-m3', 'colbert_linear.pt')
-    if not os.path.exists(model_path):
-        raise FileNotFoundError(f"FileNotFoundError: {model_path}")
+    repo_id = "BAAI/bge-m3"
+    filename = "colbert_linear.pt"
 
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    return torch.load(model_path, map_location=device, weights_only=True)
+
+    try:
+        hf_model_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+        )
+        print(f"Loaded from Hugging Face: {hf_model_path}")
+
+        model = torch.load(hf_model_path, map_location=device)
+
+    except Exception as e:
+        print("Failed to load from Hugging Face. Reason:", e)
+
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        local_model_path = os.path.join(script_dir, 'bge-m3', 'colbert_linear.pt')
+
+        if not os.path.exists(local_model_path):
+            raise FileNotFoundError(f"FileNotFoundError: {local_model_path}")
+
+        print(f"Loaded from local: {local_model_path}")
+        model = torch.load(local_model_path, map_location=device)
+
+    return model
 
 
 def _init_colbert_weights(tf_model):
@@ -225,7 +246,7 @@ def convert_and_save_model(model_name: str, save_path: str):
 
 
 if __name__ == "__main__":
-    model_name = "./bge-m3"
+    model_name = "BAAI/bge-m3"
     save_path = "./converted_bge_m3"
 
     tf_model = convert_and_save_model(model_name, save_path)
```
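The rewritten `load_colbert_weights` now resolves `colbert_linear.pt` from the Hugging Face Hub first and only falls back to a local `bge-m3/` checkout when the download fails. Note that the new code also drops `weights_only=True` from `torch.load`. A minimal standalone sketch of the Hub-first pattern, keeping the safer flag (repo and filename come from the diff above; the rest is illustrative):

```python
from huggingface_hub import hf_hub_download
import torch

# First call downloads the file into the local Hub cache;
# subsequent calls resolve from the cache without re-downloading.
path = hf_hub_download(repo_id="BAAI/bge-m3", filename="colbert_linear.pt")

# weights_only=True limits unpickling to tensors and plain containers,
# which is safer for checkpoints you did not produce yourself.
state = torch.load(path, map_location="cpu", weights_only=True)
print(sorted(state.keys()))  # expected: the linear layer's parameter names
```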

README.md

Lines changed: 5 additions & 4 deletions
````diff
@@ -43,21 +43,22 @@ git clone https://huggingface.co/BAAI/bge-m3
 
 3. Set up Python virtual environment:
 ```bash
-python3.11 -m venv venv
+uv venv
 source ./venv/bin/activate
-pip install -r requirements.txt
+uv sync
+# uv add -r requirements.txt
 ```
 
 ## Usage
 
 1. Convert the model weights:
 ```bash
-python BGEM3WeightConverter.py
+uv run BGEM3WeightConverter.py
 ```
 
 2. Validate the converted model:
 ```bash
-python model_conversion_validator.py
+uv run model_conversion_validator.py
 ```
 
 ## Model Architecture
````

model_conversion_validator.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -261,11 +261,11 @@ def compare_layer_outputs1(pt_all_layer_outputs, tf_all_layer_outputs):
 
 def main():
     # Path settings (e.g. ./bge-m3, ./converted_bge_m3)
-    model_name_or_path = "./bge-m3"  # original PyTorch model
+    model_name_or_path = "BAAI/bge-m3"  # original PyTorch model
     saved_model_dir = "./converted_bge_m3"  # converted TF model
 
     queries = [
-        "이 모델은 무엇을 하는 모델인가요?",
+        "이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?",
         "이 모델은 무엇을 하는 모델인가요?"
     ]
```
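The first query is now the same Korean sentence ("What kind of model is this?") repeated six times, so the two queries in the batch differ in token length and the shorter one must be padded, which is presumably the attention-mask code path this PR is probing. A minimal sketch of that effect (tokenizer name from the diff; the rest is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
short = "이 모델은 무엇을 하는 모델인가요?"  # "What kind of model is this?"
batch = tokenizer([short * 6, short], padding=True, return_tensors="pt")

# The shorter query is padded to the longer one's length; its
# attention_mask row ends in zeros the model is supposed to ignore.
print(batch["input_ids"].shape)
print(batch["attention_mask"][1])
```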

pyproject.toml

Lines changed: 64 additions & 0 deletions
```diff
@@ -0,0 +1,64 @@
+[project]
+name = "bge-m3-model-converter"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "absl-py==2.1.0",
+    "astunparse==1.6.3",
+    "certifi==2024.12.14",
+    "charset-normalizer==3.4.1",
+    "filelock==3.16.1",
+    "flatbuffers==24.12.23",
+    "fsspec==2024.12.0",
+    "gast==0.6.0",
+    "gdown>=5.2.0",
+    "google-pasta==0.2.0",
+    "grpcio==1.68.1",
+    "h5py==3.12.1",
+    "huggingface-hub==0.27.0",
+    "idna==3.10",
+    "jinja2==3.1.5",
+    "keras==3.7.0",
+    "libclang==18.1.1",
+    "loguru>=0.7.3",
+    "markdown==3.7",
+    "markdown-it-py==3.0.0",
+    "markupsafe==3.0.2",
+    "mdurl==0.1.2",
+    "ml-dtypes==0.3.2",
+    "mpmath==1.3.0",
+    "namex==0.0.8",
+    "networkx==3.4.2",
+    "numpy==1.26.4",
+    "nvitop>=1.4.0",
+    "opt-einsum==3.4.0",
+    "optree==0.13.1",
+    "packaging==24.2",
+    "protobuf==4.25.5",
+    "pygments==2.18.0",
+    "pyyaml==6.0.2",
+    "regex==2024.11.6",
+    "requests==2.32.3",
+    "rich==13.9.4",
+    "safetensors==0.4.5",
+    "six==1.17.0",
+    "sympy==1.13.1",
+    "tensorboard==2.16.2",
+    "tensorboard-data-server==0.7.2",
+    "tensorflow==2.16.2",
+    "tensorflow-io-gcs-filesystem==0.37.1",
+    "termcolor==2.5.0",
+    "tf-keras==2.16.0",
+    "tokenizers==0.21.0",
+    "torch==2.5.1",
+    "torchaudio>=2.5.1",
+    "torchvision>=0.20.1",
+    "tqdm==4.67.1",
+    "transformers==4.47.1",
+    "typing-extensions==4.12.2",
+    "urllib3==2.3.0",
+    "werkzeug==3.1.3",
+    "wrapt==1.17.0",
+]
```

torch_tf_validator.py

Lines changed: 94 additions & 0 deletions
```diff
@@ -0,0 +1,94 @@
+import loguru
+
+from transformers import AutoTokenizer, AutoModel
+import tensorflow as tf
+import torch
+
+def load_torch_model(model_path):
+    model = AutoModel.from_pretrained(model_path)
+    return model
+
+
+def load_tf_model(model_path):
+    with tf.device("/CPU:0"):
+        model = tf.saved_model.load(model_path)
+    return model
+
+
+def load_tokenizer(model_path):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    return tokenizer
+
+
+def tokenize_wo_padding(tokenizer, text, return_tensors="pt"):
+    return tokenizer(text, padding=False, return_tensors=return_tensors)
+
+
+def tokenize_w_padding(tokenizer, text, return_tensors="pt", max_length=512):
+    return tokenizer(text, padding="max_length", max_length=max_length, return_tensors=return_tensors)
+
+
+def main():
+    # Load the model
+    model_path = "BAAI/bge-m3"
+    model_path_tf = "/workspace/BGE-M3-Model-Converter/model"
+    model = load_torch_model(model_path)
+    tokenizer = load_tokenizer(model_path)
+
+    # Tokenize the text
+    text = "Hello, my dog is cute"
+    inputs = tokenize_wo_padding(tokenizer, text)
+    inputs_w_padding = tokenize_w_padding(tokenizer, text)
+
+    # Get the output from the model
+    loguru.logger.info("Torch] Model output".ljust(50, "-"))
+    model.eval().to("cuda")
+    with torch.no_grad():
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        inputs_w_padding = {k: v.to("cuda") for k, v in inputs_w_padding.items()}
+
+        output = model(**inputs)
+        output_w_padding = model(**inputs_w_padding)
+        loguru.logger.info("output without padding (GT)".ljust(50, "-"))
+        loguru.logger.info(output['last_hidden_state'][:, 0])
+        loguru.logger.info("="*50)
+        loguru.logger.info("output with padding".ljust(50, "-"))
+        loguru.logger.info(output_w_padding['last_hidden_state'][:, 0])
+        loguru.logger.info("="*50)
+        err = torch.abs(output['last_hidden_state'][:, 0] - output_w_padding['last_hidden_state'][:, 0])
+        loguru.logger.info("Error".ljust(50, "-"))
+        loguru.logger.info(err.mean())
+
+    inputs_tf = tokenize_wo_padding(tokenizer, text, return_tensors="tf")
+    inputs_tf_w_padding = tokenize_w_padding(tokenizer, text, return_tensors="tf")
+    inputs_tf_w_padding_attnFixed = inputs_tf_w_padding.copy()
+    inputs_tf_w_padding_attnFixed['attention_mask'] = tf.where(inputs_tf_w_padding['attention_mask'] == 0, -9999999, 0)
+    tf_model = load_tf_model(model_path_tf).signatures["serving_default"]
+
+    loguru.logger.info("Tensorflow] Model output".ljust(50, "-"))
+    with tf.device("/GPU:0"):
+        output_tf = tf_model(**inputs_tf)
+        output_tf_w_padding = tf_model(**inputs_tf_w_padding)
+        output_tf_w_padding_attnFixed = tf_model(**inputs_tf_w_padding_attnFixed)
+        loguru.logger.info("output without padding (GT)".ljust(50, "-"))
+        loguru.logger.info(output_tf['hidden_states'][-1][:, 0])
+        loguru.logger.info("="*50)
+        loguru.logger.info("output with padding".ljust(50, "-"))
+        loguru.logger.info(output_tf_w_padding['hidden_states'][-1][:, 0])
+        loguru.logger.info("="*50)
+        loguru.logger.info("output with padding (attention fixed)".ljust(50, "-"))
+        loguru.logger.info(output_tf_w_padding_attnFixed['hidden_states'][-1][:, 0])
+        loguru.logger.info("="*50)
+        err_tf = tf.abs(output_tf['hidden_states'][-1][:, 0] - output_tf_w_padding['hidden_states'][-1][:, 0])
+        loguru.logger.info("Error".ljust(50, "-"))
+        loguru.logger.info(tf.reduce_mean(err_tf))
+        loguru.logger.info("="*50)
+        err_tf_attnFixed = tf.abs(output_tf_w_padding['hidden_states'][-1][:, 0] - output_tf_w_padding_attnFixed['hidden_states'][-1][:, 0])
+        loguru.logger.info("Error (attention fixed)".ljust(50, "-"))
+        loguru.logger.info(tf.reduce_mean(err_tf_attnFixed))
+        loguru.logger.info("="*50)
+
+
+
+if __name__ == "__main__":
+    main()
```
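The `attnFixed` branch above is the heart of the mismatch report: the Hugging Face tokenizer emits a binary mask (1 = real token, 0 = padding), while the exported TF graph appears to add the supplied mask directly to the attention logits, so padded positions are only suppressed once the zeros are remapped to a large negative bias. A standalone sketch of that remapping (values and shapes are illustrative):

```python
import tensorflow as tf

# Binary keep/drop mask as produced by Hugging Face tokenizers.
binary_mask = tf.constant([[1, 1, 1, 0, 0]])

# Additive mask as consumed by logits-plus-mask attention:
# 0 keeps a position, a large negative value removes it after softmax.
additive_mask = tf.where(binary_mask == 0, -9999999, 0)
print(additive_mask.numpy())  # [[0 0 0 -9999999 -9999999]]
```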
