Add fallback reward for correct language without thinking blocks

casteryh · casteryh · commit b15f17134afe · 2025-10-31T14:43:31.000-07:00
- Add partial_reward (default 0.5) and fallback_reward (default 0.2) parameters
- If no <think> blocks are found, check whether the response text is in the target language
- Reward structure:
  * full_reward (1.0): Single block + correct language
  * partial_reward (0.5): Multiple blocks + correct language
  * fallback_reward (0.2): No blocks + correct language in response text
  * no_match_reward (0.0): Wrong language, or an empty response (including one that is empty after stray think tags are stripped)
- Update all tests to reflect new behavior (29 tests passing)
diff --git a/src/forge/data/rewards.py b/src/forge/data/rewards.py
@@ -90,8 +90,10 @@ class LanguageReward:
 
     Args:
         target_language: ISO 639-1 language code (e.g., 'en', 'ja', 'zh', 'es')
-        full_reward: Reward when detected language matches target
-        no_match_reward: Reward when detected language doesn't match target
+        full_reward: Reward when language matches and format is correct (single block)
+        partial_reward: Reward when language matches but format is wrong (multiple blocks)
+        fallback_reward: Reward when no valid blocks but response text is in target language
+        no_match_reward: Reward when language doesn't match
 
     Note: Requires langid to be installed. Install with: pip install langid
     """
@@ -100,10 +102,14 @@ def __init__(
         self,
         target_language: str = "en",
         full_reward: float = 1.0,
+        partial_reward: float = 0.5,
+        fallback_reward: float = 0.2,
         no_match_reward: float = 0.0,
     ):
         self.target_language = target_language
         self.full_reward = full_reward
+        self.partial_reward = partial_reward
+        self.fallback_reward = fallback_reward
         self.no_match_reward = no_match_reward
         self._THINK_BLOCK_RE = re.compile(
             r"<\s*think\s*>(.*?)<\s*/\s*think\s*>", re.IGNORECASE | re.DOTALL
@@ -129,21 +135,38 @@ def __call__(self, prompt: str, response: str, target: str | None = None) -> flo
             target: Optional target string (unused but kept for signature consistency)
 
         Returns:
-            full_reward if detected language matches target_language and format is correct,
-            no_match_reward otherwise (including when format is wrong or no thinking block)
+            full_reward if language matches and exactly one thinking block is found,
+            partial_reward if language matches but multiple thinking blocks found,
+            fallback_reward if no valid blocks but response text is in target language,
+            no_match_reward otherwise (wrong language)
         """
         if not response:
             return self.no_match_reward
 
         # Extract all thinking blocks
         matches = self._THINK_BLOCK_RE.findall(response)
 
-        # Return 0 reward if format is wrong (0 or multiple thinking blocks)
-        if len(matches) != 1:
+        # If no thinking blocks found, check if response text is in target language
+        if len(matches) == 0:
+            # Remove any partial tags that might exist
+            response_text = re.sub(
+                r"<\s*/?\s*think\s*>", "", response, flags=re.IGNORECASE
+            ).strip()
+
+            if not response_text:
+                return self.no_match_reward
+
+            # Detect language of general response
+            detected_lang, confidence = self._langid.classify(response_text)
+
+            # Give fallback reward if response is in target language
+            if detected_lang == self.target_language:
+                return self.fallback_reward
+
             return self.no_match_reward
 
-        # Get the single thinking block content
-        thinking_content = matches[0]
+        # Concatenate all thinking blocks for language detection
+        thinking_content = " ".join(matches)
 
         # Remove extra whitespace
         thinking_content = re.sub(r"\s+", " ", thinking_content).strip()
@@ -154,8 +177,13 @@ def __call__(self, prompt: str, response: str, target: str | None = None) -> flo
         # Detect language using langid
         detected_lang, confidence = self._langid.classify(thinking_content)
 
-        # Return full reward if language matches target
+        # Check if language matches target
         if detected_lang == self.target_language:
-            return self.full_reward
+            # Full reward for correct format (single block)
+            if len(matches) == 1:
+                return self.full_reward
+            # Partial reward for wrong format (multiple blocks) but correct language
+            else:
+                return self.partial_reward
 
         return self.no_match_reward
diff --git a/tests/unit_tests/rl/test_language_reward.py b/tests/unit_tests/rl/test_language_reward.py
@@ -19,23 +19,35 @@ def setUp(self):
         self.reward_en = LanguageReward(target_language="en")
         self.reward_ja = LanguageReward(target_language="ja")
         self.custom_reward = LanguageReward(
-            target_language="ja", full_reward=0.9, no_match_reward=0.1
+            target_language="ja",
+            full_reward=0.9,
+            partial_reward=0.6,
+            fallback_reward=0.3,
+            no_match_reward=0.1,
         )
 
     def test_init_default_values(self):
         """Test LanguageReward initialization with default values."""
         reward = self.LanguageReward()
         self.assertEqual(reward.target_language, "en")
         self.assertEqual(reward.full_reward, 1.0)
+        self.assertEqual(reward.partial_reward, 0.5)
+        self.assertEqual(reward.fallback_reward, 0.2)
         self.assertEqual(reward.no_match_reward, 0.0)
 
     def test_init_custom_values(self):
         """Test LanguageReward initialization with custom values."""
         reward = self.LanguageReward(
-            target_language="ja", full_reward=0.9, no_match_reward=0.1
+            target_language="ja",
+            full_reward=0.9,
+            partial_reward=0.6,
+            fallback_reward=0.3,
+            no_match_reward=0.1,
         )
         self.assertEqual(reward.target_language, "ja")
         self.assertEqual(reward.full_reward, 0.9)
+        self.assertEqual(reward.partial_reward, 0.6)
+        self.assertEqual(reward.fallback_reward, 0.3)
         self.assertEqual(reward.no_match_reward, 0.1)
 
     def test_init_missing_langid(self):
@@ -112,10 +124,17 @@ def test_call_language_mismatch(self):
         self.assertEqual(result, 0.0)
 
     def test_call_with_no_thinking_tags(self):
-        """Test __call__ with response containing no thinking tags."""
+        """Test __call__ with response containing no thinking tags but correct language."""
         result = self.reward_en(
             "prompt", "This is just a regular response without any thinking tags."
         )
+        # No thinking blocks but response is in English, should get fallback reward
+        self.assertEqual(result, 0.2)
+
+    def test_call_with_no_thinking_tags_wrong_language(self):
+        """Test __call__ with response containing no thinking tags and wrong language."""
+        result = self.reward_en("prompt", "これは日本語の応答です。タグはありません。")
+        # No thinking blocks and wrong language, should get no_match_reward
         self.assertEqual(result, 0.0)
 
     def test_call_with_empty_thinking_block(self):
@@ -145,15 +164,15 @@ def test_call_with_whitespace_in_tags(self):
         self.assertEqual(result, 1.0)
 
     def test_call_multiple_thinking_blocks(self):
-        """Test __call__ with multiple thinking blocks (wrong format)."""
+        """Test __call__ with multiple thinking blocks (wrong format but correct language)."""
         response = """
         <think>First thought in English.</think>
         Some text in between.
         <think>Second thought also in English.</think>
         """
         result = self.reward_en("prompt", response)
-        # Multiple blocks = wrong format, should return 0
-        self.assertEqual(result, 0.0)
+        # Multiple blocks = wrong format, but language is correct, should return partial_reward
+        self.assertEqual(result, 0.5)
 
     def test_call_multiple_thinking_blocks_mixed_languages(self):
         """Test __call__ with multiple thinking blocks in different languages (wrong format)."""
@@ -162,8 +181,9 @@ def test_call_multiple_thinking_blocks_mixed_languages(self):
         <think>これは短い日本語。</think>
         """
         result = self.reward_en("prompt", response)
-        # Multiple blocks = wrong format, should return 0
-        self.assertEqual(result, 0.0)
+        # Multiple blocks with mixed languages - langid will detect dominant language
+        # Should return either partial_reward (if detects English) or no_match_reward (if detects Japanese)
+        self.assertIn(result, [0.0, 0.5])
 
     def test_call_multiline_thinking_block(self):
         """Test __call__ with multiline thinking blocks."""
@@ -192,20 +212,31 @@ def test_call_with_target_parameter(self):
         result = self.reward_en("prompt", response, target="some target")
         self.assertEqual(result, 1.0)
 
-        result = self.reward_en("prompt", "no tags", target="some target")
-        self.assertEqual(result, 0.0)
+        # Longer English text without tags should get fallback reward
+        result = self.reward_en(
+            "prompt",
+            "This is a response without thinking tags but in English language.",
+            target="some target",
+        )
+        self.assertEqual(result, 0.2)
 
     def test_call_custom_reward_values(self):
         """Test __call__ with custom reward values."""
-        response_ja = "<think>これは日本語です。</think>"
+        response_ja_single = "<think>これは日本語です。</think>"
+        response_ja_multiple = "<think>最初の考え。</think><think>次の考え。</think>"
+        response_ja_no_tags = "これはタグなしの日本語です。"
         response_en = "<think>This is English.</think>"
-        response_none = "no thinking tags"
-
-        # Test custom full reward
-        self.assertEqual(self.custom_reward("prompt", response_ja), 0.9)
-        # Test custom no_match reward
+        response_none = ""
+
+        # Test custom full reward (single block, correct language)
+        self.assertEqual(self.custom_reward("prompt", response_ja_single), 0.9)
+        # Test custom partial reward (multiple blocks, correct language)
+        self.assertEqual(self.custom_reward("prompt", response_ja_multiple), 0.6)
+        # Test custom fallback reward (no blocks, correct language)
+        self.assertEqual(self.custom_reward("prompt", response_ja_no_tags), 0.3)
+        # Test custom no_match reward (wrong language)
         self.assertEqual(self.custom_reward("prompt", response_en), 0.1)
-        # Test no tags
+        # Test empty response
         self.assertEqual(self.custom_reward("prompt", response_none), 0.1)
 
     def test_call_zero_custom_values(self):