refactor: Refactor (group-)rule to be lazily evaluated (#64)

delsner · web-flow · commit eb61574f4ef0 · 2025-06-18T18:09:26.000+02:00
diff --git a/dataframely/_base_schema.py b/dataframely/_base_schema.py
@@ -11,9 +11,9 @@
 
 import polars as pl
 
-from ._rule import GroupRule, Rule, with_evaluation_rules
+from ._rule import GroupRule, Rule
 from .columns import Column
-from .exc import ImplementationError, RuleImplementationError
+from .exc import ImplementationError
 
 _COLUMN_ATTR = "__dataframely_columns__"
 _RULE_ATTR = "__dataframely_rules__"
@@ -112,23 +112,6 @@ def __new__(
                         f"which are not in the schema: {missing_list}."
                     )
 
-        # 3) Assuming that non-custom rules are implemented correctly, we check that all
-        # custom rules are _also_ implemented correctly by evaluating rules on an
-        # empty data frame and checking for the evaluated dtypes.
-        if len(result.rules) > 0:
-            lf_empty = pl.LazyFrame(
-                schema={col_name: col.dtype for col_name, col in result.columns.items()}
-            )
-            # NOTE: For some reason, `polars` does not yield correct dtypes when calling
-            #  `collect_schema()`
-            schema = with_evaluation_rules(lf_empty, result.rules).collect().schema
-            for rule_name, rule in result.rules.items():
-                dtype = schema[rule_name]
-                if not isinstance(dtype, pl.Boolean):
-                    raise RuleImplementationError(
-                        rule_name, dtype, isinstance(rule, GroupRule)
-                    )
-
         return super().__new__(mcs, name, bases, namespace, *args, **kwargs)
 
     def __getattribute__(cls, name: str) -> Any:
diff --git a/dataframely/_rule.py b/dataframely/_rule.py
@@ -15,8 +15,15 @@
 class Rule:
     """Internal class representing validation rules."""
 
-    def __init__(self, expr: pl.Expr) -> None:
-        self.expr = expr
+    def __init__(self, expr: pl.Expr | ValidationFunction) -> None:
+        self._expr = expr
+
+    @property
+    def expr(self) -> pl.Expr:
+        """Return the rule expression, invoking the validation function if lazy."""
+        if callable(self._expr):
+            return self._expr()  # not cached: re-invoked on every access
+        return self._expr
 
     def matches(self, other: Rule) -> bool:
         """Check whether this rule semantically matches another rule.
@@ -49,7 +56,9 @@ def __repr__(self) -> str:
 class GroupRule(Rule):
     """Rule that is evaluated on a group of columns."""
 
-    def __init__(self, expr: pl.Expr, group_columns: list[str]) -> None:
+    def __init__(
+        self, expr: pl.Expr | ValidationFunction, group_columns: list[str]
+    ) -> None:
         super().__init__(expr)
         self.group_columns = group_columns
 
@@ -101,8 +110,8 @@ def rule(*, group_by: list[str] | None = None) -> Callable[[ValidationFunction],
 
     def decorator(validation_fn: ValidationFunction) -> Rule:
         if group_by is not None:
-            return GroupRule(expr=validation_fn(), group_columns=group_by)
-        return Rule(expr=validation_fn())
+            return GroupRule(expr=validation_fn, group_columns=group_by)
+        return Rule(expr=validation_fn)
 
     return decorator
 
diff --git a/dataframely/exc.py b/dataframely/exc.py
@@ -3,8 +3,6 @@
 
 from collections import defaultdict
 
-import polars as pl
-
 from ._polars import PolarsDataType
 
 
@@ -108,27 +106,3 @@ def __init__(self, attr: str, kls: type) -> None:
                 "`from __future__ import annotations` in the file that defines the collection."
             )
         super().__init__(message)
-
-
-class RuleImplementationError(ImplementationError):
-    """Error raised when a rule is implemented incorrectly."""
-
-    def __init__(
-        self, name: str, return_dtype: pl.DataType, is_group_rule: bool
-    ) -> None:
-        if is_group_rule:
-            details = (
-                " When implementing a group rule (i.e. when using the `group_by` "
-                "parameter), make sure to use an aggregation function such as `.any()`, "
-                "`.all()`, and others to reduce an expression evaluated on multiple "
-                "rows in the same group to a single boolean value for the group."
-            )
-        else:
-            details = ""
-
-        message = (
-            f"Validation rule '{name}' has not been implemented correctly. It "
-            f"returns dtype '{return_dtype}' but it must return a boolean value."
-            + details
-        )
-        super().__init__(message)
diff --git a/tests/schema/test_rule_implementation.py b/tests/schema/test_rule_implementation.py
@@ -6,7 +6,7 @@
 
 import dataframely as dy
 from dataframely._rule import GroupRule, Rule
-from dataframely.exc import ImplementationError, RuleImplementationError
+from dataframely.exc import ImplementationError
 from dataframely.testing import create_schema
 
 
@@ -29,32 +29,6 @@ def test_group_rule_group_by_error() -> None:
         )
 
 
-def test_rule_implementation_error() -> None:
-    with pytest.raises(
-        RuleImplementationError, match=r"rule 'integer_rule'.*returns dtype 'Int64'"
-    ):
-        create_schema(
-            "test",
-            columns={"a": dy.Integer()},
-            rules={"integer_rule": Rule(pl.col("a") + 1)},
-        )
-
-
-def test_group_rule_implementation_error() -> None:
-    with pytest.raises(
-        RuleImplementationError,
-        match=(
-            r"rule 'b_greater_zero'.*returns dtype 'List\(Boolean\)'.*"
-            r"make sure to use an aggregation function"
-        ),
-    ):
-        create_schema(
-            "test",
-            columns={"a": dy.Integer(), "b": dy.Integer()},
-            rules={"b_greater_zero": GroupRule(pl.col("b") > 0, group_columns=["a"])},
-        )
-
-
 def test_rule_column_overlap_error() -> None:
     with pytest.raises(
         ImplementationError,
diff --git a/tests/schema/test_validate.py b/tests/schema/test_validate.py
@@ -28,6 +28,24 @@ def b_unique_within_a() -> pl.Expr:
         return pl.col("b").n_unique() == 1
 
 
+class MyComplexSchemaWithLazyRules(dy.Schema):
+    a = dy.Int64()
+    b = dy.Int64()
+
+    @dy.rule()
+    def b_greater_a() -> pl.Expr:
+        return MyComplexSchemaWithLazyRules.b.col > MyComplexSchemaWithLazyRules.a.col
+
+    @dy.rule(group_by=["a"])
+    def b_unique_within_a() -> pl.Expr:
+        return (
+            MyComplexSchemaWithLazyRules.b.col.n_unique() == SOME_CONSTANT_DEFINED_LATER
+        )
+
+
+SOME_CONSTANT_DEFINED_LATER = 1  # Bound after the class; read when the rule runs.
+
+
 # -------------------------------------- COLUMNS ------------------------------------- #
 
 
@@ -119,9 +137,13 @@ def test_success_multi_row_strip_cast(
 
 
 @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
-def test_group_rule_on_nulls(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
+@pytest.mark.parametrize("schema", [MyComplexSchema, MyComplexSchemaWithLazyRules])
+def test_group_rule_on_nulls(
+    df_type: type[pl.DataFrame] | type[pl.LazyFrame],
+    schema: type[MyComplexSchema] | type[MyComplexSchemaWithLazyRules],
+) -> None:
     # The schema is violated because we have multiple "b" values for the same "a" value
     df = df_type({"a": [None, None], "b": [1, 2]})
     with pytest.raises(RuleValidationError):
-        MyComplexSchema.validate(df, cast=True)
-    assert not MyComplexSchema.is_valid(df, cast=True)
+        schema.validate(df, cast=True)
+    assert not schema.is_valid(df, cast=True)