Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/test-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ jobs:
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
exclude:
- os: "windows-latest"
python-version: "3.13t"
fail-fast: false
name: CPython ${{ matrix.python-version }}-${{ matrix.os }}
steps:
Expand Down Expand Up @@ -89,10 +86,14 @@ jobs:
- name: Ensure imports with no test deps
run: just import-check
- name: Run the tests
if: ${{ ! endsWith(matrix.python-version, 't') }}
env:
UV_PYTHON: ${{matrix.python-version}}
run: just test

- name: Run the tests with no optional deps
env:
UV_PYTHON: ${{matrix.python-version}}
run: just test-no-optional
docs:
runs-on: ubuntu-latest
steps:
Expand Down
5 changes: 5 additions & 0 deletions bindings/python/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@

---

# Changes in Version 1.10.0 (2025/xx/yy)

- Make `pandas` an optional dependency.
- Add support for free-threaded python on Windows.

# Changes in Version 1.9.0 (2025/05/27)

- Providing a schema now enforces strict type adherence for data.
Expand Down
8 changes: 5 additions & 3 deletions bindings/python/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import-check:
uv run python -c "from pymongoarrow.lib import libbson_version"

benchmark *args:
uv sync --dev --extra test --extra test-polars
uv sync --dev --extra test --extra test-polars --extra test-pandas
uv run asv run -e --python=$(uv run python -c "import sys;print(sys.executable)") {{args}}

install:
Expand All @@ -23,8 +23,10 @@ install:
uv run pre-commit install

test *args:
uv sync --extra test --extra test-polars || uv sync --extra test
uv run pytest {{args}}
uv run --no-dev --extra test --extra test-polars --extra test-pandas pytest {{args}}

test-no-optional *args:
uv run --no-dev --extra test pytest {{args}}

lint:
uv sync --no-install-project --dev --frozen
Expand Down
24 changes: 17 additions & 7 deletions bindings/python/pymongoarrow/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@
from decimal import Decimal

import numpy as np
import pandas as pd

# pandas is an optional dependency: bind ``pd`` to ``None`` when it is not
# installed so downstream code can feature-detect it (``if pd is not None``)
# instead of failing at import time.
try:
    import pandas as pd
except ImportError:
    pd = None

try:
import polars as pl
Expand Down Expand Up @@ -170,6 +174,9 @@ def _arrow_to_pandas(arrow_table):
See https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
for details.
"""
if pd is None:
msg = "pandas is not installed. Try pip install pandas."
raise ValueError(msg)
return arrow_table.to_pandas(split_blocks=True, self_destruct=True)


Expand Down Expand Up @@ -238,10 +245,10 @@ def _arrow_to_numpy(arrow_table, schema=None):

for fname in schema:
dtype = get_numpy_type(schema[fname])
container[fname] = arrow_table[fname].to_numpy()
if dtype == np.str_:
container[fname] = arrow_table[fname].to_pandas().to_numpy(dtype=dtype)
else:
container[fname] = arrow_table[fname].to_numpy()
container[fname] = container[fname].astype(np.str_)

return container


Expand Down Expand Up @@ -427,7 +434,7 @@ def _tabular_generator(tabular, *, exclude_none=False):
yield {k: v for k, v in row.items() if v is not None}
else:
yield row
elif isinstance(tabular, pd.DataFrame):
elif pd is not None and isinstance(tabular, pd.DataFrame):
for row in tabular.to_dict("records"):
if exclude_none:
yield {k: v for k, v in row.items() if not np.isnan(v)}
Expand Down Expand Up @@ -498,7 +505,7 @@ def write(collection, tabular, *, exclude_none: bool = False):
cols = [tabular.column(i).cast(new_types[i]) for i in range(tabular.num_columns)]
tabular = Table.from_arrays(cols, names=tabular.column_names)
_validate_schema(tabular.schema.types)
elif isinstance(tabular, pd.DataFrame):
elif pd is not None and isinstance(tabular, pd.DataFrame):
_validate_schema(ArrowSchema.from_pandas(tabular).types)
elif pl is not None and isinstance(tabular, pl.DataFrame):
tabular = tabular.to_arrow() # zero-copy in most cases and done in tabular_gen anyway
Expand All @@ -523,7 +530,10 @@ def write(collection, tabular, *, exclude_none: bool = False):

# Add handling for special case types.
codec_options = collection.codec_options
type_registry = TypeRegistry([_PandasNACodec(), _DecimalCodec()])
if pd is not None:
type_registry = TypeRegistry([_PandasNACodec(), _DecimalCodec()])
else:
type_registry = TypeRegistry([_DecimalCodec()])
codec_options = codec_options.with_options(type_registry=type_registry)

while cur_offset < tab_size:
Expand Down
20 changes: 14 additions & 6 deletions bindings/python/pymongoarrow/pandas_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,22 @@
import re

import numpy as np
import pandas as pd
import pyarrow as pa
from bson import Binary, Code, Decimal128, ObjectId
from pandas.api.extensions import (
ExtensionArray,
ExtensionDtype,
register_extension_dtype,
)

# pandas is an optional dependency.  When it is absent, install minimal
# stand-ins so the extension-dtype classes in this module can still be
# *defined*: the base classes degrade to plain ``object`` and the
# ``@register_extension_dtype`` decorator becomes a no-op.
try:
    import pandas as pd
    from pandas.api.extensions import (
        ExtensionArray,
        ExtensionDtype,
        register_extension_dtype,
    )
except ImportError:
    ExtensionDtype = object
    ExtensionArray = object

    def register_extension_dtype(func):
        # No-op stand-in: without pandas there is no dtype registry to
        # register with, so the decorated class is returned unchanged.
        return func


class PandasBSONDtype(ExtensionDtype):
Expand Down
3 changes: 2 additions & 1 deletion bindings/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dependencies = [
# Must be kept in sync with "build-system.requires" above.
"pyarrow >=20.0,<20.1",
"pymongo >=4.4,<5",
"pandas >=1.3.5,<3",
"numpy>=2.0.1",
"packaging >=23.2",
]
dynamic = ["version"]
Expand All @@ -53,6 +53,7 @@ Tracker = "https://jira.mongodb.org/projects/INTPYTHON/issues"
[project.optional-dependencies]
test = ["pytz", "pytest"]
test-polars = ["polars"]
test-pandas = ["pandas>=1.3.5,<3"]

[tool.setuptools]
zip-safe = false
Expand Down
11 changes: 8 additions & 3 deletions bindings/python/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@

import pytest

pytest_plugins = [
"pandas.tests.extension.conftest",
]
# pandas ships reusable extension-array test fixtures; expose them to pytest
# only when pandas is importable (it is an optional dependency).
try:
    import pandas as pd  # noqa: F401
except ImportError:
    pass
else:
    pytest_plugins = [
        "pandas.tests.extension.conftest",
    ]


@pytest.fixture(autouse=True, scope="session")
Expand Down
6 changes: 5 additions & 1 deletion bindings/python/test/pandas_types/test_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
import numpy as np
import pytest
from bson import Binary
from pandas.tests.extension import base

from pymongoarrow.pandas_types import PandasBinary, PandasBinaryArray

try:
from pandas.tests.extension import base
except ImportError:
pytest.skip("skipping pandas tests", allow_module_level=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add something like "To include, create an environment like so: uv sync --dev --extra test --extra test-polars --extra test-pandas"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's already part of just test, that seems redundant

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. As long as we are steering contributors to the justfile and they've opened it, we're good.


try:
base.BaseIndexTests
except AttributeError:
Expand Down
7 changes: 6 additions & 1 deletion bindings/python/test/pandas_types/test_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@
import numpy as np
import pytest
from bson import Code
from pandas.tests.extension import base

from pymongoarrow.pandas_types import PandasCode, PandasCodeArray

try:
from pandas.tests.extension import base
except ImportError:
pytest.skip("skipping pandas tests", allow_module_level=True)


try:
base.BaseIndexTests
except AttributeError:
Expand Down
7 changes: 6 additions & 1 deletion bindings/python/test/pandas_types/test_decimal128.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@
import numpy as np
import pytest
from bson import Decimal128
from pandas.tests.extension import base

from pymongoarrow.pandas_types import PandasDecimal128, PandasDecimal128Array

try:
from pandas.tests.extension import base
except ImportError:
pytest.skip("skipping pandas tests", allow_module_level=True)


try:
base.BaseIndexTests
except AttributeError:
Expand Down
7 changes: 6 additions & 1 deletion bindings/python/test/pandas_types/test_objectid.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@
import numpy as np
import pytest
from bson import ObjectId
from pandas.tests.extension import base

from pymongoarrow.pandas_types import PandasObjectId, PandasObjectIdArray

try:
from pandas.tests.extension import base
except ImportError:
pytest.skip("skipping pandas tests", allow_module_level=True)


try:
base.BaseIndexTests
except AttributeError:
Expand Down
4 changes: 4 additions & 0 deletions bindings/python/test/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ def test_write_error(self):
raise awe

def test_write_schema_validation(self):
try:
import pandas as pd # noqa: F401
except ImportError:
self.skipTest("Test requires pandas")
arrow_schema = {k.__name__: v(True) for k, v in _TYPE_NORMALIZER_FACTORY.items()}
schema = {k: v.to_pandas_dtype() for k, v in arrow_schema.items()}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use pandas to go from arrow to numpy?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for this test, to get the right data types

schema["str"] = "str"
Expand Down
9 changes: 7 additions & 2 deletions bindings/python/test/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
from test.utils import AllowListEventListener, NullsTestMixin

import numpy as np
import pandas as pd
import pandas.testing
import pyarrow as pa
import pytest
from bson import Binary, Code, CodecOptions, Decimal128, ObjectId
from pyarrow import decimal256, int32, int64
from pymongo import DESCENDING, WriteConcern
Expand All @@ -37,6 +36,12 @@
from pymongoarrow.pandas_types import PandasBSONDtype, PandasDecimal128, PandasObjectId
from pymongoarrow.types import _TYPE_NORMALIZER_FACTORY, Decimal128Type, ObjectIdType

try:
import pandas as pd
import pandas.testing
except ImportError:
pytest.skip("skipping pandas tests", allow_module_level=True)


class PandasTestBase(unittest.TestCase):
@classmethod
Expand Down
31 changes: 30 additions & 1 deletion bindings/python/test/test_pymongoarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import unittest
from test import client_context

from pymongoarrow.api import find_arrow_all
from pymongoarrow.api import find_arrow_all, find_pandas_all, find_polars_all
from pymongoarrow.schema import Schema
from pymongoarrow.version import __version__

Expand All @@ -34,6 +34,35 @@ def test_version(self):
self.assertIsNotNone(__version__)
self.assertIsInstance(__version__, str)

def test_no_pandas(self):
    """``find_pandas_all`` must raise ``ValueError`` when pandas is absent."""
    # This test is only meaningful in an environment without pandas.
    try:
        import pandas as pd  # noqa: F401
    except ImportError:
        pass
    else:
        self.skipTest("Requires no pandas")
    self.client.test.drop_collection("test")
    coll = self.client.test.test
    coll.insert_many([{"data": False} for _ in range(1000)])
    with self.assertRaises(ValueError):
        find_pandas_all(coll, {}, schema=Schema({"data": bool}))

def test_no_polars(self):
    """``find_polars_all`` must raise ``ValueError`` when polars is absent."""
    # This test is only meaningful in an environment without polars.
    try:
        import polars as pl  # noqa: F401
    except ImportError:
        pass
    else:
        self.skipTest("Requires no polars")
    self.client.test.drop_collection("test")
    coll = self.client.test.test
    coll.insert_many([{"data": False} for _ in range(1000)])
    with self.assertRaises(ValueError):
        find_polars_all(coll, {}, schema=Schema({"data": bool}))

def test_capped_collection(self):
self.client.test.drop_collection("test")
self.client.test.create_collection("test", capped=True, size=5000)
Expand Down
25 changes: 21 additions & 4 deletions bindings/python/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import numpy as np
import pyarrow as pa
from bson import Decimal128, ObjectId
from pandas import isna
from pyarrow import bool_, float64, int64, string, timestamp
from pymongo import WriteConcern, monitoring

Expand All @@ -33,6 +32,24 @@
)


def isnan(inp):
    """Return a list of booleans flagging the missing entries of *inp*.

    Accepts a pyarrow ``Array``/``ChunkedArray`` or any iterable.  ``None``
    and values whose string form matches pandas' missing-value sentinels
    ("<NA>", "NaT") are treated as missing, alongside genuine float NaNs.
    Non-numeric values (e.g. ObjectId, str) are never considered NaN.
    """
    values = inp.to_pylist() if isinstance(inp, (pa.Array, pa.ChunkedArray)) else inp

    def _missing(item):
        # pandas missing-value sentinels stringify to these markers.
        if item is None or str(item) in ("<NA>", "NaT"):
            return True
        try:
            return np.isnan(item)
        except TypeError:
            # np.isnan rejects non-numeric input: not a NaN.
            return False

    return [_missing(v) for v in values]


class EventListener(monitoring.CommandListener):
def __init__(self):
self.results = defaultdict(list)
Expand Down Expand Up @@ -176,7 +193,7 @@ def test_int_handling(self):
self.assertType(table["int64"], atype)

# Does it contain NAs where we expect?
self.assertTrue(np.all(np.equal(isna(int64_arr), isna(table["int64"]))))
self.assertTrue(np.all(np.equal(isnan(int64_arr), isnan(table["int64"]))))

# Write
self.coll.drop()
Expand Down Expand Up @@ -217,7 +234,7 @@ def test_other_handling(self):
self.assertType(table["other"], con_type)
self.assertEqual(
self.na_safe(con_type),
np.all(np.equal(isna(others), isna(table["other"]))),
np.all(np.equal(isnan(others), isnan(table["other"]))),
)

def writeback():
Expand Down Expand Up @@ -262,4 +279,4 @@ def test_bool_handling(self):
self.assertType(table["bool_"], atype)

# Does it contain Nones where expected?
self.assertTrue(np.all(np.equal(isna(bools), isna(table["bool_"]))))
self.assertTrue(np.all(np.equal(isnan(bools), isnan(table["bool_"]))))
Loading
Loading