Skip to content

Commit 1b36884

Browse files
committed
Add virtual dataset support to JLD2.jl
1 parent 833dc98 commit 1b36884

11 files changed

+1568
-33
lines changed

docs/src/advanced.md

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,4 +171,97 @@ This can be useful for performance when one expects to append many additional da
171171
## Fallback Behaviour
172172
By default JLD2 will attempt to open files using the `MmapIO` backend. If that fails, it retries using `IOStream`.
173173

174+
## Virtual Datasets
175+
176+
Virtual datasets (VDS) allow you to create datasets that reference data from multiple source files without copying the data. This is useful for combining large distributed datasets efficiently.
177+
178+
### Basic Usage
179+
180+
Create a virtual dataset mapping entire source files:
181+
182+
```julia
183+
using JLD2
184+
185+
# Create source files
186+
jldsave("data1.jld2"; x = fill(1.0, 3))
187+
jldsave("data2.jld2"; x = fill(2.0, 3))
188+
189+
# Create virtual dataset
190+
jldopen("virtual.jld2", "w") do f
191+
mappings = [
192+
JLD2.VirtualMapping("./data1.jld2", "x"),
193+
JLD2.VirtualMapping("./data2.jld2", "x")
194+
]
195+
JLD2.create_virtual_dataset(f, "combined", (3, 2), Float64, mappings)
196+
end
197+
198+
# Read back
199+
data = jldopen("virtual.jld2", "r") do f
200+
f["combined"] # Returns [1.0 2.0; 1.0 2.0; 1.0 2.0]
201+
end
202+
```
203+
204+
### Selection Methods
205+
206+
Virtual mappings support three ways to specify regions:
207+
208+
**1. Julia index ranges (recommended)**
209+
```julia
210+
mapping = JLD2.VirtualMapping("./data.jld2", "measurements";
211+
vds_indices=(1:1, 1:5)) # Place in first row, columns 1-5
212+
213+
mapping = JLD2.VirtualMapping("./data.jld2", "measurements";
214+
src_indices=(1:10, 5:15), # Take rows 1-10, cols 5-15 from source
215+
vds_indices=(1:10, 1:11)) # Place at rows 1-10, cols 1-11 in VDS
216+
```
217+
218+
**2. Root index + shape (most intuitive)**
219+
```julia
220+
mapping = JLD2.VirtualMapping("./data.jld2", "measurements";
221+
vds_root=(2, 1), # Start at row 2, column 1
222+
vds_shape=(1, 5)) # Block is 1 row × 5 columns
223+
224+
mapping = JLD2.VirtualMapping("./data.jld2", "measurements";
225+
src_root=(5, 10), src_shape=(3, 4), # Take 3×4 block from source
226+
vds_root=(1, 1), vds_shape=(3, 4)) # Place at top-left of VDS
227+
```
228+
229+
**3. Direct HyperslabSelection (advanced)**
230+
```julia
231+
vds_sel = JLD2.HyperslabSelection([0x0, 0x0], [0x1, 0x1], [0x1, 0x1], [0x5, 0x1])
232+
mapping = JLD2.VirtualMapping("./data.jld2", "measurements"; vds_selection=vds_sel)
233+
```
234+
235+
### Strided Selections
236+
237+
Select non-contiguous regions using strided ranges:
238+
239+
```julia
240+
# Every other row
241+
mapping = JLD2.VirtualMapping("./data.jld2", "measurements";
242+
vds_indices=(1:2:10, 1:5)) # Rows 1, 3, 5, 7, 9 in VDS
243+
```
244+
245+
### Automatic Inference
246+
247+
Automatically infer dimensions and types from source files:
248+
249+
```julia
250+
jldopen("virtual.jld2", "w") do f
251+
source_files = ["./data1.jld2", "./data2.jld2", "./data3.jld2"]
252+
253+
# Automatically determines dimensions and element type
254+
JLD2.create_virtual_dataset(f, "combined", source_files, "measurements")
255+
end
256+
```
257+
258+
### Pattern-based File Names
259+
260+
Use `%b` for sequential file patterns:
261+
262+
```julia
263+
# Expands to sub-0.jld2, sub-1.jld2, etc.
264+
mapping = JLD2.VirtualMapping("./sub-%b.jld2", "dataset")
265+
```
266+
174267

src/JLD2.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool,
180180
parallel_read::Bool=false,
181181
plain::Bool=false
182182
) where T<:Union{Type{IOStream},Type{MmapIO}}
183-
183+
184184
mmaparrays && @warn "mmaparrays keyword is currently ignored" maxlog = 1
185185
filters = Filters.normalize_filters(compress)
186186

@@ -501,6 +501,8 @@ include("Filters.jl")
501501
using .Filters: WrittenFilterPipeline, FilterPipeline, iscompressed
502502
using .Filters: Shuffle, Deflate, ZstdFilter
503503

504+
include("virtual_datasets.jl")
505+
include("virtual_datasets_patternbased.jl")
504506
include("datasets.jl")
505507
include("global_heaps.jl")
506508
include("fractal_heaps.jl")

src/datalayouts.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ struct DataLayout
1515
end
1616

1717
ischunked(dl::DataLayout) = dl.storage_type == LcChunked
18+
isvirtual(dl::DataLayout) = dl.storage_type == LcVirtual
1819
DataLayout(f::JLD2.JLDFile, msg_::Hmessage) =
1920
DataLayout(f, HmWrap(HmDataLayout, msg_))
2021

@@ -44,7 +45,14 @@ function DataLayout(f::JLD2.JLDFile, msg::HmWrap{HmDataLayout})
4445

4546
chunk_dimensions = Int[msg.dimensions[1:end-1]...] # drop element size as last dimension
4647
chunked_storage = true
47-
DataLayout(version, storage_type, data_length, data_offset, msg.dimensionality, 0, chunk_dimensions)
48+
DataLayout(version, storage_type, data_length, data_offset, msg.dimensionality, 0, chunk_dimensions)
49+
elseif storage_type == LcVirtual
50+
# Virtual dataset layout
51+
data_length = -1 # Virtual datasets don't have a fixed data length
52+
heap_address = msg.data_address
53+
index = msg.index
54+
# Store the global heap address in data_offset for now
55+
DataLayout(version, storage_type, data_length, fileoffset(f, heap_address), 0, index, UInt64[])
4856
else
4957
throw(UnsupportedFeatureException("Unknown data layout"))
5058
end

src/datasets.jl

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,67 @@ Otherwise, `datatype_offset` points to the offset of the datatype attribute.
7878
v = Array{T, 1}()
7979
track_weakref!(f, header_offset, v)
8080
return v
81+
elseif isvirtual(layout)
82+
# Handle virtual dataset
83+
return read_virtual_data(f, dataspace, dt, layout, filters, header_offset, attributes)
8184
end
8285
seek(f.io, layout.data_offset)
8386
read_dataspace = (dataspace, header_offset, layout, filters)
8487
read_data(f, rr, read_dataspace, attributes)
8588
end
8689

90+
function read_virtual_data(f::JLDFile, dataspace::ReadDataspace,
91+
@nospecialize(dt::H5Datatype),
92+
layout::DataLayout,
93+
filters::FilterPipeline,
94+
header_offset::RelOffset,
95+
attributes::Union{Vector{ReadAttribute},Nothing})
96+
# Read virtual dataset layout from global heap
97+
hid = GlobalHeapID(h5offset(f, layout.data_offset), UInt32(layout.chunk_indexing_type))
98+
99+
io = f.io
100+
# Find the global heap
101+
if haskey(f.global_heaps, hid.heap_offset)
102+
gh = f.global_heaps[hid.heap_offset]
103+
else
104+
seek(io, fileoffset(f, hid.heap_offset))
105+
f.global_heaps[hid.heap_offset] = gh = jlread(io, GlobalHeap)
106+
end
107+
108+
# Seek to the heap object
109+
seek(io, gh.objects[hid.index] + 8) # Skip object index, ref count, reserved
110+
obj_size = Int(jlread(io, Length))
111+
112+
# Read the virtual dataset global heap block (Version 0 format)
113+
version = jlread(io, UInt8)
114+
version != 0 && throw(UnsupportedVersionException(
115+
"Only virtual dataset heap block version 0 is currently supported, got version $version"))
116+
117+
# Read number of entries (8 bytes for "Size of Lengths")
118+
num_entries = Int(jlread(io, UInt64))
119+
120+
# Read each mapping
121+
mappings = VirtualMapping[]
122+
for i in 1:num_entries
123+
# Read source filename (null-terminated string)
124+
source_filename = read_bytestring(io)
125+
126+
# Read source dataset name (null-terminated string)
127+
source_dataset = read_bytestring(io)
128+
129+
# Read source selection
130+
src_selection = jlread(io, DataspaceSelection)
131+
132+
# Read virtual selection
133+
vds_selection = jlread(io, DataspaceSelection)
134+
135+
push!(mappings, VirtualMapping(source_filename, source_dataset, src_selection, vds_selection))
136+
end
137+
138+
# Process the virtual dataset mappings to create the combined dataset
139+
return combine_virtual_mappings(f, mappings, dataspace, dt)
140+
end
141+
87142
# Most types can only be scalars or arrays
88143
@nospecializeinfer function read_data(f::JLDFile,
89144
@nospecialize(rr),

src/dataspaces.jl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,21 @@ struct WriteDataspace{N,A<:Tuple}
1212
dataspace_type::UInt8
1313
size::NTuple{N,Length}
1414
attributes::A
15+
max_dimensions::NTuple{N,UInt64}
1516
end
1617

18+
# Outer constructors for convenience
19+
WriteDataspace(dataspace_type::UInt8, size::NTuple{N,UInt64}, attributes::A) where {N,A<:Tuple} =
20+
WriteDataspace{N,A}(dataspace_type, size, attributes, size)
21+
1722
struct ReadDataspace
1823
dataspace_type::UInt8
1924
dimensionality::UInt8
2025
dimensions_offset::Int64
2126
end
2227
ReadDataspace() = ReadDataspace(DS_SCALAR, 0, -1)
2328

24-
ReadDataspace(f, msg_::Union{Hmessage, Message}) =
29+
ReadDataspace(f, msg_::Union{Hmessage, Message}) =
2530
ReadDataspace(f, HmWrap(HmDataspace, msg_))
2631
ReadDataspace(f, msg::HmWrap{HmDataspace}) =
2732
ReadDataspace(msg.dataspace_type, msg.dimensionality, fileoffset(f, msg.dim_offset))
@@ -105,4 +110,4 @@ function jlwrite(io::IO, dspace::WriteDataspace{N}) where N
105110
for x in dspace.size
106111
jlwrite(io, x::Length)
107112
end
108-
end
113+
end

src/global_heaps.jl

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,26 @@ isatend(f::JLDFile, gh::GlobalHeap) =
1616
heap_object_length(data::AbstractArray) = length(data)
1717
heap_object_length(::Any) = 1
1818

19-
function write_heap_object(f::JLDFile, odr::ODR, data, wsession::JLDWriteSession) where ODR
20-
# The type parameter ODR is needed to convince the compiler to specialize on ODR.
21-
psz = odr_sizeof(odr) * heap_object_length(data)
22-
objsz = 8 + jlsizeof(Length) + psz
23-
objsz += 8 - mod1(objsz, 8)
19+
"""
20+
allocate_in_global_heap(f::JLDFile, objsz::Int) -> GlobalHeap
21+
22+
Allocate space in the global heap for an object of size `objsz`.
23+
Returns the GlobalHeap that has space for the object.
24+
25+
Allocation strategy:
26+
1. Use existing heap if object fits
27+
2. Extend existing heap if it's at end of file
28+
3. Create new heap otherwise
29+
"""
30+
function allocate_in_global_heap(f::JLDFile, objsz::Int)
2431
io = f.io
2532

26-
# This is basically a memory allocation problem. Right now we do it
27-
# in a pretty naive way. We:
28-
#
29-
# 1. Put the object in the last created global heap if it fits
30-
# 2. Extend the last global heap if it's at the end of the file
31-
# 3. Create a new global heap if we can't do 1 or 2
32-
#
33-
# This is not a great approach if we're writing objects of
34-
# different sizes interspersed with new datasets. The torture case
35-
# would be a Vector{Any} of mutable objects, some of which contain
36-
# large (>4080 byte) strings and some of which contain small
37-
# strings. In that case, we'd be better off trying to put the small
38-
# strings into existing heaps, rather than writing new ones. This
39-
# should be revisited at a later date.
40-
4133
# Can only fit up to typemax(UInt16) items in a single heap
4234
heap_filled = length(f.global_heap.objects) >= typemax(UInt16)
35+
4336
if objsz + 8 + jlsizeof(Length) < f.global_heap.free && !heap_filled
4437
# Fits in existing global heap
45-
gh = f.global_heap
38+
return f.global_heap
4639
elseif isatend(f, f.global_heap) && !heap_filled
4740
# Global heap is at end and can be extended
4841
gh = f.global_heap
@@ -52,6 +45,7 @@ function write_heap_object(f::JLDFile, odr::ODR, data, wsession::JLDWriteSession
5245
seek(io, gh.offset + 8)
5346
jlwrite(io, gh.length)
5447
f.end_of_data += delta
48+
return gh
5549
else
5650
# Need to create a new global heap
5751
heapsz = max(objsz, 4096)
@@ -63,9 +57,21 @@ function write_heap_object(f::JLDFile, odr::ODR, data, wsession::JLDWriteSession
6357
f.end_of_data = position(io) + heapsz
6458
gh = f.global_heap = f.global_heaps[h5offset(f, offset)] =
6559
GlobalHeap(offset, heapsz, heapsz, Int64[])
60+
return gh
6661
end
62+
end
63+
64+
function write_heap_object(f::JLDFile, odr::ODR, data, wsession::JLDWriteSession) where ODR
65+
# The type parameter ODR is needed to convince the compiler to specialize on ODR.
66+
psz = odr_sizeof(odr) * heap_object_length(data)
67+
objsz = 8 + jlsizeof(Length) + psz
68+
objsz += 8 - mod1(objsz, 8)
69+
io = f.io
70+
71+
# Allocate space in global heap
72+
gh = allocate_in_global_heap(f, objsz)
6773

68-
# Write data
74+
# Write object header
6975
index = length(gh.objects) + 1
7076
objoffset = gh.offset + 8 + jlsizeof(Length) + gh.length - gh.free
7177
seek(io, objoffset)
@@ -74,7 +80,7 @@ function write_heap_object(f::JLDFile, odr::ODR, data, wsession::JLDWriteSession
7480
jlwrite(io, UInt32(0)) # Reserved
7581
jlwrite(io, Length(psz)) # Object size
7682

77-
# Update global heap object
83+
# Update global heap
7884
gh.free -= objsz
7985
push!(gh.objects, objoffset)
8086

@@ -85,9 +91,9 @@ function write_heap_object(f::JLDFile, odr::ODR, data, wsession::JLDWriteSession
8591
jlwrite(io, Length(gh.free - 8 - jlsizeof(Length))) # Object size
8692
end
8793

88-
# Write data
94+
# Write actual data
8995
seek(io, objoffset + 8 + jlsizeof(Length))
90-
write_data(io, f, data, odr, datamode(odr), wsession) # Object data
96+
write_data(io, f, data, odr, datamode(odr), wsession)
9197

9298
GlobalHeapID(h5offset(f, gh.offset), index)
9399
end

src/headermessages.jl

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
(version == 1) && dataspace_type::@computed(DS_V1)
1212
version == 1 && @skip(5)
1313
dim_offset::@Offset
14-
dimensions::NTuple{Int(dimensionality), Int64}
15-
isset(flags,0) && max_dimension_size::NTuple{Int(dimensionality), Int64}
14+
dimensions::NTuple{Int(dimensionality), UInt64}
15+
if isset(flags,0)
16+
max_dimension_size::NTuple{Int(dimensionality), UInt64} = kw.dimensions
17+
end
1618
end
1719

1820
@pseudostruct HmLinkInfo begin
@@ -148,8 +150,10 @@ end
148150
data_address::RelOffset
149151
end
150152
if layout_class == LcVirtual # Virtual Storage
151-
data_address::RelOffset
152-
index::UInt32
153+
# Virtual Dataset Storage Layout
154+
# Points to global heap containing virtual dataset mappings
155+
data_address::RelOffset # Global heap address containing VDS mappings
156+
index::UInt32 # Index within the global heap collection
153157
end
154158
end
155159
end

0 commit comments

Comments
 (0)