datahub-project · sgomezvillamor · Oct 31, 2025
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -219,6 +219,13 @@ def test_connection(config_dict: dict) -> TestConnectionReport:
         return BigQueryTestConnection.test_connection(config_dict)
 
     def _init_schema_resolver(self) -> SchemaResolver:
+        """
+        The initialization of SchemaResolver prefetches all existing URNs and schemas in the env/platform/instance.
+        Because of that, it's important that all classes requiring a SchemaResolver use this instance, as its cache is already pre-populated.
+        An alternative strategy would be to do an on-demand resolution of the urns/schemas.
+
+        TODO: prove pre-fetch is a better strategy than on-demand resolution or make this behaviour configurable.
+        """
         schema_resolution_required = (
             self.config.use_queries_v2 or self.config.lineage_use_sql_parser
         )

diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py
@@ -168,7 +168,10 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
                 self._track_cache_hit()
                 return urn_mixed, schema_info
 
-        # Track cache miss for the final attempt
+        logger.debug(
+            f"Schema resolution failed for table {table}. Tried URNs: "
+            f"primary={urn}, lower={urn_lower}, mixed={urn_mixed}"
+        )
         self._track_cache_miss()
 
         if self._prefers_urn_lower():