diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d3b94d3808240..af586703a7c5c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -219,6 +219,13 @@ def test_connection(config_dict: dict) -> TestConnectionReport: return BigQueryTestConnection.test_connection(config_dict) def _init_schema_resolver(self) -> SchemaResolver: + """ + The initialization of SchemaResolver prefetches all existing urns and schemas in the env/platform/instance. + Because of that, it's important that all classes requiring a SchemaResolver use this instance, as it has an already pre-populated cache. + An alternative strategy would be to do an on-demand resolution of the urns/schemas. + + TODO: prove pre-fetch is a better strategy than on-demand resolution or make this behaviour configurable. + """ schema_resolution_required = ( self.config.use_queries_v2 or self.config.lineage_use_sql_parser ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index eca043ac57922..d8e85f3e50d1d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -168,7 +168,10 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: self._track_cache_hit() return urn_mixed, schema_info - # Track cache miss for the final attempt + logger.debug( + f"Schema resolution failed for table {table}. Tried URNs: " + f"primary={urn}, lower={urn_lower}, mixed={urn_mixed}" + ) self._track_cache_miss() if self._prefers_urn_lower():