Commit 80ea044

some fixes

1 parent 5e27207 commit 80ea044

9 files changed: +40 −32 lines changed

python/pyarrow/compute.py

Lines changed: 6 additions & 3 deletions

@@ -609,7 +609,8 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
         sort_keys.append(("dummy", "descending"))
     else:
         sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys)
-    options = SelectKOptions(k, sort_keys)  # type: ignore
+    assert isinstance(sort_keys, (map, list, tuple))
+    options = SelectKOptions(k, sort_keys)
     return call_function("select_k_unstable", [values], options, memory_pool)

@@ -656,7 +657,8 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
         sort_keys.append(("dummy", "ascending"))
     else:
         sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys)
-    options = SelectKOptions(k, sort_keys)  # type: ignore
+    assert isinstance(sort_keys, (map, list, tuple))
+    options = SelectKOptions(k, sort_keys)
     return call_function("select_k_unstable", [values], options, memory_pool)

@@ -682,7 +684,8 @@ def random(n, *, initializer='system', options=None, memory_pool=None):
     memory_pool : pyarrow.MemoryPool, optional
         If not passed, will allocate memory from the default memory pool.
     """
-    options = RandomOptions(initializer=initializer)  # type: ignore
+    options = RandomOptions(
+        initializer=initializer)  # type: ignore[reportArgumentType]
     return call_function("random", [], options, memory_pool, length=n)
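
Note: the recurring fix in this commit swaps blanket "# type: ignore" comments for runtime assert checks that also narrow the type for the static checker. A minimal sketch of the call path these two hunks guard, with a made-up table (not from the diff):

    import pyarrow as pa
    import pyarrow.compute as pc

    # Hypothetical input, purely for illustration.
    table = pa.table({"score": [3, 1, 4, 1, 5]})

    # top_k_unstable returns the indices of the k largest values.
    indices = pc.top_k_unstable(table, k=2, sort_keys=["score"])
    print(table.take(indices))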

python/pyarrow/dataset.py

Lines changed: 4 additions & 0 deletions

@@ -54,6 +54,9 @@
         get_partition_keys as _get_partition_keys,  # keep for backwards compatibility
         _filesystemdataset_write,
     )
+    from pyarrow.fs import FileInfo
+
+
 except ImportError as exc:
     raise ImportError(
         f"The pyarrow installation is not built with support for 'dataset' ({str(exc)})"

@@ -429,6 +432,7 @@ def _ensure_single_source(path, filesystem=None):

     # retrieve the file descriptor
     file_info = filesystem.get_file_info(path)
+    assert isinstance(file_info, FileInfo)

     # depending on the path type either return with a recursive
     # directory selector or as a list containing a single file
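
Note: the new assert encodes that FileSystem.get_file_info returns a single FileInfo when given a single path (a list comes back only for a list of paths or a FileSelector). A small sketch, assuming a local directory exists at the hard-coded path:

    from pyarrow import fs

    local = fs.LocalFileSystem()
    info = local.get_file_info("/tmp")   # one path in, one FileInfo out
    assert isinstance(info, fs.FileInfo)
    print(info.type)                     # e.g. FileType.Directory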

python/pyarrow/pandas_compat.py

Lines changed: 17 additions & 16 deletions

@@ -92,21 +92,21 @@ def get_logical_type(arrow_type):
 def get_numpy_logical_type_map():
     global _numpy_logical_type_map  # noqa: F824
     if not _numpy_logical_type_map:
-        _numpy_logical_type_map.update({  # type: ignore[arg-type]
-            np.bool_: 'bool',  # type: ignore
-            np.int8: 'int8',  # type: ignore
-            np.int16: 'int16',  # type: ignore
-            np.int32: 'int32',  # type: ignore
-            np.int64: 'int64',  # type: ignore
-            np.uint8: 'uint8',  # type: ignore
-            np.uint16: 'uint16',  # type: ignore
-            np.uint32: 'uint32',  # type: ignore
-            np.uint64: 'uint64',  # type: ignore
-            np.float32: 'float32',  # type: ignore
-            np.float64: 'float64',  # type: ignore
+        _numpy_logical_type_map.update({  # type: ignore[reportCallIssue]
+            np.bool_: 'bool',
+            np.int8: 'int8',
+            np.int16: 'int16',
+            np.int32: 'int32',
+            np.int64: 'int64',
+            np.uint8: 'uint8',
+            np.uint16: 'uint16',
+            np.uint32: 'uint32',
+            np.uint64: 'uint64',
+            np.float32: 'float32',
+            np.float64: 'float64',
             'datetime64[D]': 'date',
-            np.str_: 'string',  # type: ignore
-            np.bytes_: 'bytes',  # type: ignore
+            np.str_: 'string',
+            np.bytes_: 'bytes',
         })
     return _numpy_logical_type_map

@@ -779,7 +779,7 @@ def make_datetimetz(unit, tz):
     if _pandas_api.is_v1():
         unit = 'ns'  # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
     tz = pa.lib.string_to_tzinfo(tz)
-    return _pandas_api.datetimetz_type(unit, tz=tz)  # type: ignore
+    return _pandas_api.datetimetz_type(unit, tz=tz)  # type: ignore[reportArgumentType]

@@ -826,7 +826,8 @@ def table_to_dataframe(

         return df
     else:
-        from pandas.core.internals import BlockManager  # type: ignore
+        from pandas.core.internals import (  # type: ignore[reportMissingImports]
+            BlockManager)
         from pandas import DataFrame

         blocks = [
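
Note: a bare "# type: ignore" silences every diagnostic on its line; scoping it to one code (mypy-style "[arg-type]", Pyright-style "[reportCallIssue]") keeps unrelated errors visible, which is the direction these hunks move in. A contrived sketch (for the checker; the function is never called):

    def parse_port(raw: int) -> str:
        return str(raw)

    def checker_demo() -> None:
        # Suppresses only the arg-type diagnostic; any other error on
        # this line would still be reported.
        parse_port("8080")  # type: ignore[arg-type]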

python/pyarrow/tests/parquet/test_metadata.py

Lines changed: 3 additions & 2 deletions

@@ -240,8 +240,8 @@ def test_parquet_raise_on_unset_statistics():

     stat = meta.row_group(0).column(0).statistics
     assert stat is not None
-    assert not stat.has_min_max  # type: ignore[attr-defined]
-    assert stat.max is None  # type: ignore[attr-defined]
+    assert not stat.has_min_max
+    assert stat.max is None


 def test_statistics_convert_logical_types(tempdir):

@@ -507,6 +507,7 @@ def test_multi_dataset_metadata(tempdir):
     _meta.append_row_groups(meta[0])

     # Write merged metadata-only file
+    assert _meta is not None
     with open(metapath, "wb") as f:
         _meta.write_metadata_file(f)  # type: ignore[union-attr]
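
Note: "assert x is not None" is the idiomatic way to narrow an Optional for a type checker inside a test, which is what both hunks lean on. A minimal sketch:

    from typing import Optional

    def shout(name: Optional[str]) -> str:
        assert name is not None  # narrows Optional[str] to str from here on
        return name.upper()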

python/pyarrow/tests/parquet/test_pandas.py

Lines changed: 4 additions & 3 deletions

@@ -464,8 +464,9 @@ def test_backwards_compatible_column_metadata_handling(datadir):
     table = _read_table(
         path, columns=['a'])
     result = table.to_pandas()
-    tm.assert_frame_equal(result, cast(
-        pd.DataFrame, expected[['a']].reset_index(drop=True)))
+    expected_df = expected[['a']].reset_index(drop=True)
+    assert isinstance(expected_df, pd.DataFrame)
+    tm.assert_frame_equal(result, expected_df)


 @pytest.mark.pandas

@@ -525,7 +526,7 @@ def test_pandas_categorical_roundtrip():
     codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
     categories = ['foo', 'bar', 'baz']
     df = pd.DataFrame({'x': pd.Categorical.from_codes(
-        codes, categories=pd.Index(categories))})  # type: ignore[arg-type]
+        codes, categories=categories)})  # type: ignore[arg-type]

     buf = pa.BufferOutputStream()
     pq.write_table(pa.table(df), buf)
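
Note: pd.Categorical.from_codes accepts a plain list of categories, so the pd.Index(...) wrapper the second hunk drops was never needed. A quick sketch:

    import numpy as np
    import pandas as pd

    codes = np.array([2, 0, -1], dtype='int32')  # -1 marks a missing value
    cat = pd.Categorical.from_codes(codes, categories=['foo', 'bar', 'baz'])
    print(cat)  # ['baz', 'foo', NaN]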

python/pyarrow/tests/strategies.py

Lines changed: 1 addition & 1 deletion

@@ -324,7 +324,7 @@ def arrays(draw, type, size=None, nullable=True):
                 datetime.timedelta(hours=offset_hours, minutes=offset_min)
             )
         except ValueError:
-            tz = zoneinfo.ZoneInfo(ty.tz)  # type: ignore[union-attr]
+            tz = zoneinfo.ZoneInfo(str(ty.tz))
         value = st.datetimes(timezones=st.just(tz), min_value=min_datetime,
                              max_value=max_datetime)
     elif pa.types.is_duration(ty):
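
Note: zoneinfo.ZoneInfo expects a str key, and the checker cannot prove ty.tz is non-None here, so coercing with str(...) replaces the union-attr ignore. A minimal sketch with a hard-coded key standing in for ty.tz:

    import datetime
    import zoneinfo

    key = "America/New_York"          # stands in for ty.tz
    tz = zoneinfo.ZoneInfo(str(key))  # str(...) satisfies the checker if key is Optional
    print(datetime.datetime(2024, 1, 1, tzinfo=tz))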

python/pyarrow/tests/test_acero.py

Lines changed: 2 additions & 2 deletions

@@ -90,7 +90,7 @@ def test_declaration_to_reader(table_source):
 def test_table_source():
     with pytest.raises(TypeError):
         TableSourceNodeOptions(pa.record_batch(
-            [pa.array([1, 2, 3])], ["a"]))  # type: ignore[arg-type]
+            [pa.array([1, 2, 3])], ["a"]))

     table_source = TableSourceNodeOptions(None)
     decl = Declaration("table_source", table_source)

@@ -387,7 +387,7 @@ def test_hash_join_with_residual_filter():
         "left outer", left_keys="key", right_keys="key",
         filter_expression=(
             pc.equal(pc.field("a"), 5)
-            | pc.equal(pc.field("b"), 10))  # type: ignore[arg-type]
+            | pc.equal(pc.field("b"), 10))  # type: ignore[reportOperatorIssue]
     )
     joined = Declaration(
         "hashjoin", options=join_opts, inputs=[left_source, right_source])

python/pyarrow/tests/test_dataset.py

Lines changed: 3 additions & 3 deletions

@@ -3345,7 +3345,7 @@ def _check_dataset(schema, expected, expected_schema=None):
     # Specifying with differing field types
     schema = pa.schema([('a', 'int32'), ('b', 'float64')])
     dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema)
-    expected = pa.table([table['a'].cast(pa.int32()),  # type: ignore[arg-type]
+    expected = pa.table([table['a'].cast('int32'),  # type: ignore[arg-type]
                         table['b']],
                        names=['a', 'b'])
     _check_dataset(schema, expected)

@@ -4055,13 +4055,13 @@ def test_filter_mismatching_schema(tempdir, dataset_reader):
     # cast the column
     filtered = dataset_reader.to_table(dataset, filter=ds.field("col") > 2)
     assert filtered["col"].equals(table["col"].cast(
-        pa.int64()).slice(2))  # type: ignore[arg-type]
+        'int64').slice(2))  # type: ignore[arg-type]

     fragment = list(dataset.get_fragments())[0]
     filtered = dataset_reader.to_table(
         fragment, filter=ds.field("col") > 2, schema=schema)
     assert filtered["col"].equals(table["col"].cast(
-        pa.int64()).slice(2))  # type: ignore[arg-type]
+        'int64').slice(2))  # type: ignore[arg-type]


 @pytest.mark.parquet
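
Note: cast accepts a type's string alias as well as a DataType instance, so 'int32' is equivalent to pa.int32(); the string form happens to sidestep the stub complaint. A quick sketch:

    import pyarrow as pa

    col = pa.chunked_array([[1, 2, 3]])
    assert col.cast('int32').equals(col.cast(pa.int32()))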

python/pyarrow/tests/test_flight.py

Lines changed: 0 additions & 2 deletions

@@ -63,7 +63,6 @@
 )
 except ImportError:
     flight = None  # type: ignore[assignment]
-    # type: ignore[assignment, misc]
     FlightClient, FlightServerBase = object, object
     ServerAuthHandler, ClientAuthHandler = (  # type: ignore[misc]
         object, object)  # type: ignore[assignment]

@@ -670,7 +669,6 @@ def start_call(self, info, headers):
         )
         if auth_header:
             values = auth_header[0].split(b' ') if isinstance(
-                # type: ignore[arg-type]
                 auth_header[0], bytes) else auth_header[0].split(' ')
             token = ''
             error_message = 'Invalid credentials'
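
Note: both deletions remove "# type: ignore" comments stranded on their own lines; an ignore comment only applies to the physical line it sits on, so these were inert. A contrived sketch (for the checker; nothing here is executed):

    def takes_int(x: int) -> int:
        return x + 1

    def checker_demo() -> None:
        # type: ignore[arg-type]
        takes_int("3")  # the comment above is inert; this call is still flagged
        takes_int("3")  # type: ignore[arg-type]  # effective: on the error's line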
