11
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
- from bson .codec_options import DEFAULT_CODEC_OPTIONS
15
- from pyarrow import Table , timestamp
14
+ from pyarrow import ListArray , StructArray , Table
16
15
17
16
from pymongoarrow .types import _BsonArrowTypes , _get_internal_typemap
18
17
19
- try :
20
- from pymongoarrow .lib import (
21
- BinaryBuilder ,
22
- BoolBuilder ,
23
- CodeBuilder ,
24
- Date32Builder ,
25
- Date64Builder ,
26
- DatetimeBuilder ,
27
- Decimal128Builder ,
28
- DocumentBuilder ,
29
- DoubleBuilder ,
30
- Int32Builder ,
31
- Int64Builder ,
32
- ListBuilder ,
33
- NullBuilder ,
34
- ObjectIdBuilder ,
35
- StringBuilder ,
36
- )
37
-
38
- _TYPE_TO_BUILDER_CLS = {
39
- _BsonArrowTypes .int32 : Int32Builder ,
40
- _BsonArrowTypes .int64 : Int64Builder ,
41
- _BsonArrowTypes .double : DoubleBuilder ,
42
- _BsonArrowTypes .datetime : DatetimeBuilder ,
43
- _BsonArrowTypes .objectid : ObjectIdBuilder ,
44
- _BsonArrowTypes .decimal128 : Decimal128Builder ,
45
- _BsonArrowTypes .string : StringBuilder ,
46
- _BsonArrowTypes .bool : BoolBuilder ,
47
- _BsonArrowTypes .document : DocumentBuilder ,
48
- _BsonArrowTypes .array : ListBuilder ,
49
- _BsonArrowTypes .binary : BinaryBuilder ,
50
- _BsonArrowTypes .code : CodeBuilder ,
51
- _BsonArrowTypes .date32 : Date32Builder ,
52
- _BsonArrowTypes .date64 : Date64Builder ,
53
- _BsonArrowTypes .null : NullBuilder ,
54
- }
55
- except ImportError :
56
- pass
57
-
58
18
59
19
class PyMongoArrowContext:
    """A context for converting BSON-formatted data to an Arrow Table."""

    def __init__(self, schema, codec_options=None):
        """Initialize the context.

        :Parameters:
          - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`, or
            ``None`` when no schema is supplied.
          - `codec_options` (optional): An instance of
            :class:`~bson.codec_options.CodecOptions`. Its ``tzinfo`` is only
            read when `schema` is ``None``.
        """
        self.schema = schema
        if schema is None and codec_options is not None:
            self.tzinfo = codec_options.tzinfo
        else:
            self.tzinfo = None

        # Flatten the (possibly nested) schema into one map keyed by dotted /
        # "[]"-suffixed field paths; it stays empty when there is no schema.
        schema_map = {}
        if schema is not None:
            str_type_map = _get_internal_typemap(schema.typemap)
            _parse_types(str_type_map, schema_map, self.tzinfo)

        # Both flags default to False when no schema is supplied.
        self.raise_on_type_error = schema.raise_on_type_error if schema is not None else False
        self.raise_on_type_null = schema.raise_on_type_null if schema is not None else False

        # Delayed import to prevent import errors for unbuilt library.
        from pymongoarrow.lib import BuilderManager

        self.manager = BuilderManager(schema_map, schema is not None, self.tzinfo)

    @classmethod
    def from_schema(cls, schema, codec_options=None):
        """Initialize the context from a :class:`~pymongoarrow.schema.Schema`
        instance.

        Kept as a thin alias for the constructor so existing callers keep
        working now that builders are created by the manager.

        :Parameters:
          - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
          - `codec_options` (optional): An instance of
            :class:`~bson.codec_options.CodecOptions`.
        """
        # NOTE(review): the previous default was DEFAULT_CODEC_OPTIONS; None is
        # used here on the assumption that the default options carry no tzinfo,
        # which would make the two equivalent for __init__ - confirm.
        return cls(schema, codec_options=codec_options)

    def process_bson_stream(self, stream):
        """Feed a BSON stream to the builder manager.

        :Parameters:
          - `stream`: The BSON data; its ``len()`` is passed along as the
            stream length.
        """
        self.manager.process_bson_stream(stream, len(stream))

    def finish(self):
        """Finish the builders and assemble the results into a
        :class:`pyarrow.Table`.

        When a schema was supplied the table is built against
        ``schema.to_arrow()``; otherwise the column names are taken from the
        finished builder map.
        """
        array_map = _parse_builder_map(self.manager.finish())
        arrays = list(array_map.values())
        if self.schema is not None:
            return Table.from_arrays(arrays=arrays, schema=self.schema.to_arrow())
        return Table.from_arrays(arrays=arrays, names=list(array_map.keys()))
64
+
65
+
66
+ def _parse_builder_map(builder_map):
67
+ # Handle nested builders.
68
+ to_remove = []
69
+ # Traverse the builder map right to left.
70
+ for key, value in reversed(builder_map.items()):
71
+ if value.type_marker == _BsonArrowTypes.document.value:
72
+ names = value.finish()
73
+ full_names = [f"{key}.{name}" for name in names]
74
+ arrs = [builder_map[c] for c in full_names]
75
+ builder_map[key] = StructArray.from_arrays(arrs, names=names)
76
+ to_remove.extend(full_names)
77
+ elif value.type_marker == _BsonArrowTypes.array.value:
78
+ child_name = key + "[]"
79
+ to_remove.append(child_name)
80
+ child = builder_map[child_name]
81
+ builder_map[key] = ListArray.from_arrays(value.finish(), child)
82
+ else:
83
+ builder_map[key] = value.finish()
84
+
85
+ for key in to_remove:
86
+ if key in builder_map:
87
+ del builder_map[key]
88
+
89
+ return builder_map
90
+
91
+
92
+ def _parse_types(str_type_map, schema_map, tzinfo):
93
+ for fname, (ftype, arrow_type) in str_type_map.items():
94
+ schema_map[fname] = ftype, arrow_type
95
+
96
+ # special-case nested builders
97
+ if ftype == _BsonArrowTypes.document.value:
98
+ # construct a sub type map here
99
+ sub_type_map = {}
100
+ for i in range(arrow_type.num_fields):
101
+ field = arrow_type[i]
102
+ sub_name = f"{fname}.{field.name}"
103
+ sub_type_map[sub_name] = field.type
104
+ sub_type_map = _get_internal_typemap(sub_type_map)
105
+ _parse_types(sub_type_map, schema_map, tzinfo)
106
+ elif ftype == _BsonArrowTypes.array.value:
107
+ sub_type_map = {}
108
+ sub_name = f"{fname}[]"
109
+ sub_value_type = arrow_type .value_type
110
+ sub_type_map [sub_name ] = sub_value_type
111
+ sub_type_map = _get_internal_typemap (sub_type_map )
112
+ _parse_types (sub_type_map , schema_map , tzinfo )
0 commit comments