@@ -32,11 +32,13 @@ def __init__(self, d_model, num_heads, dropout_rate=0.1, **kwargs):
         # Dropout layer
         self.dropout = tf.keras.layers.Dropout(dropout_rate)

-    def stable_softmax(self, logits, axis=None, name=None):
-        """
-        Stable softmax implementation
-        """
-        return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)
+    def stable_softmax(self, logits, axis=-1, name=None):
+        """Numerically stable softmax: subtract max and compute in float32."""
+        dtype = logits.dtype
+        x = tf.cast(logits, tf.float32)
+        x = x - tf.reduce_max(x, axis=axis, keepdims=True)
+        probs = tf.nn.softmax(x, axis=axis, name=name)
+        return tf.cast(probs, dtype)


     def split_heads(self, x, batch_size):
         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
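The max-subtraction in the new stable_softmax relies on softmax being shift-invariant: the probabilities are unchanged, but every exp() argument stays at or below zero. A minimal sanity check, not part of the patch and assuming TensorFlow 2.x:

import numpy as np
import tensorflow as tf

logits = tf.constant([[10.0, 20.0, 30.0]])

# Shifting by the row max changes nothing mathematically ...
shifted = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
stable = tf.nn.softmax(shifted, axis=-1)
reference = tf.nn.softmax(logits, axis=-1)

# ... but keeps exp() arguments <= 0, which matters for large or float16 logits.
np.testing.assert_allclose(stable.numpy(), reference.numpy(), rtol=1e-6)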
@@ -45,27 +47,29 @@ def split_heads(self, x, batch_size):

     def call(self, inputs, mask=None, training=False):
         batch_size = tf.shape(inputs)[0]

-        # Compute Query, Key, Value
-        q = self.wq(inputs)  # (batch_size, seq_len, d_model)
-        k = self.wk(inputs)  # (batch_size, seq_len, d_model)
-        v = self.wv(inputs)  # (batch_size, seq_len, d_model)
+        # Projections
+        q = self.wq(inputs)
+        k = self.wk(inputs)
+        v = self.wv(inputs)

-        # Split into multiple heads
-        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
-        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
-        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
+        # Split heads
+        q = self.split_heads(q, batch_size)
+        k = self.split_heads(k, batch_size)
+        v = self.split_heads(v, batch_size)

-        # Scaled Dot-Product Attention
-        sqrt_att_head_size = math.sqrt(self.depth)
-
-        attention_scores = tf.matmul(q, k, transpose_b=True)  # (batch_size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(sqrt_att_head_size, tf.float32)
-        attention_scores = tf.divide(attention_scores, dk)
+        # Scaled dot-product attention (compute in float32 for stability)
+        q_f = tf.cast(q, tf.float32)
+        k_f = tf.cast(k, tf.float32)
+        attention_scores = tf.matmul(q_f, k_f, transpose_b=True)
+        scale = tf.sqrt(tf.cast(self.depth, tf.float32))
+        attention_scores = attention_scores / scale

         if mask is not None:
-            attention_scores = tf.add(attention_scores, mask)
+            attention_scores = attention_scores + tf.cast(mask, tf.float32)

         attention_probs = self.stable_softmax(attention_scores, axis=-1)
+        # Cast back to v dtype for matmul efficiency under mixed precision
+        attention_probs = tf.cast(attention_probs, v.dtype)
         attention_probs = self.dropout(attention_probs, training=training)

         # Attention result
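For the additive mask that now gets cast to float32 and added to the scores, callers typically turn a 0/1 padding mask into large negative biases so masked keys vanish after the softmax. An illustrative sketch, not part of the patch; the shapes and the -1e9 constant are assumptions:

import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0, 0]])                # 1 = real token, 0 = padding
additive_mask = (1.0 - tf.cast(attention_mask, tf.float32)) * -1e9
additive_mask = additive_mask[:, tf.newaxis, tf.newaxis, :]    # broadcast over (batch, heads, q_len, k_len)

scores = tf.random.normal((1, 8, 5, 5))                        # (batch, heads, q_len, k_len)
masked = scores + additive_mask                                # same op as `attention_scores + tf.cast(mask, tf.float32)`
probs = tf.nn.softmax(masked, axis=-1)                         # padded keys receive ~0 probability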
@@ -118,11 +122,23 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True,
         self.num_layers = self.config.num_hidden_layers
         self.vocab_size = self.config.vocab_size

+        # Optional mixed precision
+        if self.use_fp16:
+            from tensorflow.keras import mixed_precision
+            try:
+                mixed_precision.set_global_policy("mixed_float16")
+            except Exception:
+                pass
+
         # Build components
         self._build_embeddings()
         self._build_encoder_layers()
         self._build_pooler()
+        # Handle ColBERT dim parameter
+        self.colbert_dim = self.d_model if not colbert_dim or colbert_dim < 1 else int(colbert_dim)
         self._build_colbert()
+        # Sparse head (optional)
+        self.sparse_linear = tf.keras.layers.Dense(1, name="sparse_linear")

         # Tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(
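set_global_policy("mixed_float16") makes Keras layers compute in float16 while keeping their variables in float32, which is why the attention and pooling changes in this commit cast the numerically sensitive reductions to float32. A minimal sketch of what the policy does, not part of the patch and assuming TensorFlow 2.4+:

import tensorflow as tf
from tensorflow.keras import mixed_precision

mixed_precision.set_global_policy("mixed_float16")

layer = tf.keras.layers.Dense(4)
y = layer(tf.zeros((2, 3)))

print(layer.compute_dtype)  # float16 -> matmuls run in half precision
print(layer.dtype)          # float32 -> weights stay in full precision
print(y.dtype)              # float16 -> outputs need care before losses/softmax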
@@ -207,9 +223,7 @@ def _build_pooler(self):
         )

     def _build_colbert(self):
-        self.colbert_linear = tf.keras.layers.Dense(
-            units=self.d_model,
-        )
+        self.colbert_linear = tf.keras.layers.Dense(self.colbert_dim, name="colbert_linear")

     def call(self, inputs, training=False, output_hidden_states=False):
@@ -278,7 +292,10 @@ def call(self, inputs, training=False, output_hidden_states=False):

         # Pooling
         if self.pooling_method == "mean":
-            pooled_output = tf.reduce_mean(hidden_states, axis=1)
+            m = tf.cast(attention_mask_origin, tf.float32)[:, :, None]
+            summed = tf.reduce_sum(tf.cast(hidden_states, tf.float32) * m, axis=1)
+            denom = tf.reduce_sum(m, axis=1) + tf.cast(1e-9, tf.float32)
+            pooled_output = tf.cast(summed / denom, hidden_states.dtype)
         else:  # default: cls
             pooled_output = hidden_states[:, 0, :]
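The mean-pooling branch now averages only over real tokens; plain tf.reduce_mean over axis=1 would let padding vectors drag the sentence embedding around. A tiny check, not part of the patch (the toy tensors are made up):

import tensorflow as tf

hidden_states = tf.constant([[[1.0, 1.0], [3.0, 3.0], [100.0, 100.0]]])  # last position is padding
attention_mask = tf.constant([[1, 1, 0]])

m = tf.cast(attention_mask, tf.float32)[:, :, None]
masked_mean = tf.reduce_sum(hidden_states * m, axis=1) / (tf.reduce_sum(m, axis=1) + 1e-9)

print(masked_mean.numpy())                            # [[2. 2.]] -> padding ignored
print(tf.reduce_mean(hidden_states, axis=1).numpy())  # roughly [[34.67 34.67]] -> padding leaks in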
@@ -291,15 +308,23 @@ def call(self, inputs, training=False, output_hidden_states=False):
             pooled_output = tf.nn.l2_normalize(pooled_output, axis=-1)

         ## colbert_vecs
-        colbert_vecs = self.colbert_linear(hidden_states[:, 1:])
-        colbert_vecs = colbert_vecs * tf.cast(attention_mask_origin[:, 1:][:, :, None], dtype=tf.float32)
+        colbert_vecs = None
+        if self.return_colbert_vecs:
+            m = tf.cast(attention_mask_origin[:, 1:], hidden_states.dtype)[:, :, None]
+            colbert_vecs = self.colbert_linear(hidden_states[:, 1:]) * m

         outputs = {
             "dense_vecs": pooled_output,
-            "colbert_vecs": colbert_vecs,
             "last_hidden_state": hidden_states
         }

+        if colbert_vecs is not None:
+            outputs["colbert_vecs"] = colbert_vecs
+
+        if self.return_sparse:
+            token_weights = tf.nn.relu(self.sparse_linear(hidden_states))
+            outputs["token_weights"] = token_weights
+
         if output_hidden_states:
             outputs["hidden_states"] = all_hidden_states
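The optional colbert_vecs and token_weights follow the usual dense / multi-vector / sparse split used by BGE-M3-style retrievers: the former feed a late-interaction (MaxSim) score, the latter act as per-token lexical weights. A hedged sketch of downstream use, not part of the patch; q_out/d_out and the scoring combination are illustrative only:

import tensorflow as tf

def maxsim_score(q_colbert, d_colbert):
    # q_colbert: (q_len, dim), d_colbert: (d_len, dim)
    sim = tf.matmul(q_colbert, d_colbert, transpose_b=True)  # (q_len, d_len) token-token similarities
    return tf.reduce_sum(tf.reduce_max(sim, axis=-1))        # best document token per query token, summed

# q_out = model(query_inputs); d_out = model(doc_inputs)      # hypothetical calls
# dense_score   = tf.reduce_sum(q_out["dense_vecs"] * d_out["dense_vecs"], axis=-1)
# colbert_score = maxsim_score(q_out["colbert_vecs"][0], d_out["colbert_vecs"][0])
# sparse_score  = overlap of q_out["token_weights"] and d_out["token_weights"] on shared token ids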
@@ -368,8 +393,6 @@ def save_model_with_tokenizer(model, tokenizer, save_path):
         tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='attention_mask')
     ])
     def serving_fn(input_ids, attention_mask):
-
-        print(input_ids)
         inputs = {
             'input_ids': input_ids,
             'attention_mask': attention_mask
@@ -379,15 +402,24 @@ def serving_fn(input_ids, attention_mask):

         if outputs.get('hidden_states'):
             hidden_states = tf.stack(outputs['hidden_states'], axis=0)
-            return {
-                'dense_vecs': outputs['dense_vecs'],  # CLS Token
-                'colbert_vecs': outputs['colbert_vecs'],
+            ret = {
+                'dense_vecs': outputs['dense_vecs'],  # CLS Token or masked mean
                 'hidden_states': hidden_states  # (num_layers, batch, seq_len, hidden_dim)
             }
+            if 'colbert_vecs' in outputs:
+                ret['colbert_vecs'] = outputs['colbert_vecs']
+            if 'token_weights' in outputs:
+                ret['token_weights'] = outputs['token_weights']
+            return ret
         else:
-            return {
+            ret = {
                 'dense_vecs': outputs['dense_vecs'],
             }
+            if 'colbert_vecs' in outputs:
+                ret['colbert_vecs'] = outputs['colbert_vecs']
+            if 'token_weights' in outputs:
+                ret['token_weights'] = outputs['token_weights']
+            return ret

     # Save model
     tf.saved_model.save(
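Once exported, the signature declared by serving_fn can be driven like any SavedModel signature. A hedged sketch, not part of the patch; the "serving_default" key, the export path, and loading the tokenizer from the same directory are assumptions:

import tensorflow as tf
from transformers import AutoTokenizer

export_dir = "exported_model"                        # hypothetical path passed as save_path
loaded = tf.saved_model.load(export_dir)
serving = loaded.signatures["serving_default"]       # assumes the default signature key

tokenizer = AutoTokenizer.from_pretrained(export_dir)  # assumes the tokenizer was saved alongside
enc = tokenizer(["hello world"], return_tensors="tf", padding=True)

out = serving(input_ids=tf.cast(enc["input_ids"], tf.int32),
              attention_mask=tf.cast(enc["attention_mask"], tf.int32))
print(out["dense_vecs"].shape)                       # (1, hidden_dim) sentence embedding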