
Commit 2badaa9

feat(archive): add segment-based streaming ZIP writer, ZIP64 modes; replace legacy writer
- Introduce SegmentWriter for deterministic, out-of-order segment writing and true streaming output.
- Add CRC32 combine (crc32combine) to compute the payload CRC without buffering the entire payload.
- Implement ZIP primitives (headers, central directory, EOCD) with ZIP64 mode control (Auto/Always/Never) and signed data descriptors.
- Replace the legacy tdf3 writer; remove the old writer tests and reader/writer harness.
- Update reader tests; add ZIP64 mode tests and benchmarks.
- Integrate the new writer into SDK TDF creation; track total size from streamed segments plus finalize bytes.
- Adjust the TDF tests' expected sizes and tolerances to match the new ZIP layout.

Notes: Small archives may differ by ~40–60 bytes vs. the legacy writer due to conditional ZIP64 extras and the data descriptor signature; output remains ZIP-spec compliant.
1 parent d8fa2eb commit 2badaa9
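
The crc32combine implementation itself is not part of the file shown in this diff. For reference, here is a minimal sketch of the standard technique (a port of zlib's crc32_combine, which advances CRC(A) over len(B) zero bytes via GF(2) matrix squaring and then XORs in CRC(B)); the name crc32Combine and its signature are illustrative, not the SDK's actual API:

```go
package main

import (
	"fmt"
	"hash/crc32"
)

// gf2MatrixTimes multiplies a 32x32 GF(2) matrix (one uint32 per row) by a vector.
func gf2MatrixTimes(mat *[32]uint32, vec uint32) uint32 {
	var sum uint32
	for i := 0; vec != 0; i, vec = i+1, vec>>1 {
		if vec&1 != 0 {
			sum ^= mat[i]
		}
	}
	return sum
}

// gf2MatrixSquare sets square = mat * mat in GF(2).
func gf2MatrixSquare(square, mat *[32]uint32) {
	for i := range square {
		square[i] = gf2MatrixTimes(mat, mat[i])
	}
}

// crc32Combine returns the CRC-32 of A||B given crc1 = CRC(A), crc2 = CRC(B),
// and len2 = len(B), without touching the underlying bytes. This is what lets
// a streaming writer derive the whole-payload CRC from per-segment CRCs.
func crc32Combine(crc1, crc2 uint32, len2 int64) uint32 {
	if len2 <= 0 {
		return crc1
	}
	var even, odd [32]uint32

	// Operator for one zero bit: the reflected IEEE polynomial.
	odd[0] = crc32.IEEE // 0xEDB88320
	row := uint32(1)
	for i := 1; i < 32; i++ {
		odd[i] = row
		row <<= 1
	}
	gf2MatrixSquare(&even, &odd) // operator for two zero bits
	gf2MatrixSquare(&odd, &even) // operator for four zero bits

	// Advance crc1 over len2 zero bytes by repeated matrix squaring.
	for {
		gf2MatrixSquare(&even, &odd)
		if len2&1 != 0 {
			crc1 = gf2MatrixTimes(&even, crc1)
		}
		len2 >>= 1
		if len2 == 0 {
			break
		}
		gf2MatrixSquare(&odd, &even)
		if len2&1 != 0 {
			crc1 = gf2MatrixTimes(&odd, crc1)
		}
		len2 >>= 1
	}
	return crc1 ^ crc2
}

func main() {
	a, b := []byte("hello "), []byte("world")
	whole := crc32.ChecksumIEEE(append(append([]byte{}, a...), b...))
	combined := crc32Combine(crc32.ChecksumIEEE(a), crc32.ChecksumIEEE(b), int64(len(b)))
	fmt.Printf("whole=%08x combined=%08x\n", whole, combined) // the two values match
}
```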

16 files changed: +1942 −1019 lines

Lines changed: 384 additions & 0 deletions
@@ -0,0 +1,384 @@
package archive

import (
	"testing"
)

// BenchmarkSegmentWriter_CRC32ContiguousProcessing benchmarks CRC32-related performance under
// different segment write ordering patterns. The current implementation uses CRC32-combine over
// per-segment CRCs and sizes and does not retain payload bytes between calls.
//
// Test patterns:
// - sequential: Optimal case where segments arrive in order (enables immediate processing)
// - reverse: Worst case where all segments must be buffered until the end
// - random: Pseudo-random order using deterministic pattern for reproducible results
// - interleaved: Moderate out-of-order (even indices first, then odd)
// - worst_case: Middle-out pattern that maximizes memory buffering requirements
//
// Measures: CRC32 calculation speed, memory allocation patterns, contiguous processing effectiveness
func BenchmarkSegmentWriter_CRC32ContiguousProcessing(b *testing.B) {
	testCases := []struct {
		name         string
		segmentCount int
		segmentSize  int
		writeOrder   string
	}{
		{"sequential_100x1KB", 100, 1024, "sequential"},
		{"reverse_100x1KB", 100, 1024, "reverse"},
		{"random_100x1KB", 100, 1024, "random"},
		{"interleaved_100x1KB", 100, 1024, "interleaved"},
		{"worst_case_100x1KB", 100, 1024, "worst_case"},
		{"sequential_1000x1KB", 1000, 1024, "sequential"},
		{"reverse_1000x1KB", 1000, 1024, "reverse"},
		{"worst_case_1000x1KB", 1000, 1024, "worst_case"},
	}

	for _, tc := range testCases {
		b.Run(tc.name, func(b *testing.B) {
			// Generate write order based on pattern
			writeOrder := generateWriteOrder(tc.segmentCount, tc.writeOrder)

			b.ResetTimer()
			b.ReportAllocs()

			for i := 0; i < b.N; i++ {
				writer := NewSegmentTDFWriter(tc.segmentCount)
				ctx := b.Context()

				// Create test segment data
				segmentData := make([]byte, tc.segmentSize)
				for j := range segmentData {
					segmentData[j] = byte(j % 256)
				}

				// Write segments in specified order
				for _, segIdx := range writeOrder {
					_, err := writer.WriteSegment(ctx, segIdx, segmentData)
					if err != nil {
						b.Fatal(err)
					}
				}

				// Finalize to trigger final CRC32 calculation
				manifest := []byte(`{"test": "benchmark"}`)
				_, err := writer.Finalize(ctx, manifest)
				if err != nil {
					b.Fatal(err)
				}

				writer.Close()
			}
		})
	}
}

// BenchmarkSegmentWriter_VariableSegmentSizes benchmarks performance impact of variable segment sizes
// on memory allocation and CRC32 processing efficiency.
//
// This benchmark tests how the segment writer handles dynamic memory allocation when segments have
// unpredictable sizes. Variable sizes can impact both memory allocation patterns and CRC32 processing
// efficiency, as larger segments require more memory and processing time.
//
// Test patterns:
// - uniform_1KB: Baseline with consistent 1KB segments for comparison
// - doubling: Exponentially increasing sizes (512B → 8KB) to test scaling
// - extreme_variance: Mixed small/large segments to stress memory allocator
// - fibonacci_like: Fibonacci-inspired progression for gradual size increases
// - large_mixed: Various large segments to test high memory usage patterns
//
// Measures: Memory allocation efficiency, CRC32 processing with varying data volumes, GC impact
func BenchmarkSegmentWriter_VariableSegmentSizes(b *testing.B) {
	testCases := []struct {
		name  string
		sizes []int
	}{
		{"uniform_1KB", []int{1024, 1024, 1024, 1024, 1024}},
		{"doubling", []int{512, 1024, 2048, 4096, 8192}},
		{"extreme_variance", []int{100, 10240, 200, 20480, 300}},
		{"fibonacci_like", []int{256, 512, 768, 1280, 2048}},
		{"large_mixed", []int{1024, 16384, 4096, 32768, 8192}},
	}

	for _, tc := range testCases {
		b.Run(tc.name, func(b *testing.B) {
			b.ResetTimer()
			b.ReportAllocs()

			for i := 0; i < b.N; i++ {
				writer := NewSegmentTDFWriter(len(tc.sizes))
				ctx := b.Context()

				// Write segments with variable sizes
				for segIdx, size := range tc.sizes {
					segmentData := make([]byte, size)
					for j := range segmentData {
						segmentData[j] = byte((segIdx * j) % 256)
					}

					_, err := writer.WriteSegment(ctx, segIdx, segmentData)
					if err != nil {
						b.Fatal(err)
					}
				}

				// Finalize
				manifest := []byte(`{"variable_sizes": true}`)
				_, err := writer.Finalize(ctx, manifest)
				if err != nil {
					b.Fatal(err)
				}

				writer.Close()
			}
		})
	}
}

// BenchmarkSegmentWriter_MemoryPressure benchmarks memory allocation patterns and buffering efficiency
// under various segment count and size combinations.
//
// This benchmark specifically targets memory allocation behavior to identify potential memory leaks,
// inefficient buffering strategies, and garbage collection impact. It uses WithMaxSegments(count*2)
// to allow extra buffering capacity and tests different buffer policies.
//
// Test scenarios:
// - small_segments: High segment count (1000) with minimal individual memory (512B each)
// - large_segments: Fewer segments (100) with larger memory footprint (8KB each)
// - mixed_sizes: Dynamic sizes from 512B to 4KB based on segment index modulo
//
// Write patterns test memory behavior:
// - sequential: Minimal buffering, immediate processing and cleanup
// - reverse: Maximum buffering until all segments received
// - interleaved: Moderate buffering with periodic cleanup opportunities
// - worst_case: Scattered pattern maximizing memory retention
//
// Measures: Peak memory usage, allocation patterns, buffer cleanup efficiency, GC pressure
func BenchmarkSegmentWriter_MemoryPressure(b *testing.B) {
	testCases := []struct {
		name         string
		segmentCount int
		segmentSize  int
		bufferPolicy string
	}{
		{"small_segments_sequential", 1000, 512, "sequential"},
		{"small_segments_reverse", 1000, 512, "reverse"},
		{"small_segments_worst_case", 1000, 512, "worst_case"},
		{"large_segments_sequential", 100, 8192, "sequential"},
		{"large_segments_reverse", 100, 8192, "reverse"},
		{"large_segments_interleaved", 100, 8192, "interleaved"},
		{"mixed_sizes_random", 500, 0, "mixed"}, // 0 = variable sizes
	}

	for _, tc := range testCases {
		b.Run(tc.name, func(b *testing.B) {
			writeOrder := generateWriteOrder(tc.segmentCount, tc.bufferPolicy)

			b.ResetTimer()
			b.ReportAllocs()

			for i := 0; i < b.N; i++ {
				writer := NewSegmentTDFWriter(tc.segmentCount, WithMaxSegments(tc.segmentCount*2))
				ctx := b.Context()

				// Write segments with focus on memory allocation patterns
				for orderIdx, segIdx := range writeOrder {
					var segmentData []byte

					if tc.segmentSize == 0 { // Mixed sizes mode
						size := 512 + (segIdx%8)*512 // Sizes from 512 to 4096
						segmentData = make([]byte, size)
					} else {
						segmentData = make([]byte, tc.segmentSize)
					}

					// Fill with deterministic test data
					for j := range segmentData {
						segmentData[j] = byte((orderIdx * j) % 256)
					}

					_, err := writer.WriteSegment(ctx, segIdx, segmentData)
					if err != nil {
						b.Fatal(err)
					}
				}

				// Finalize
				manifest := []byte(`{"memory_test": true}`)
				_, err := writer.Finalize(ctx, manifest)
				if err != nil {
					b.Fatal(err)
				}

				writer.Close()
			}
		})
	}
}

// BenchmarkSegmentWriter_ZIPGeneration benchmarks ZIP archive structure generation performance,
// focusing on the finalization process where the complete ZIP structure is assembled.
//
// This benchmark measures the overhead of generating ZIP format structures including local file headers,
// central directory records, and data descriptors. It compares ZIP32 vs ZIP64 performance and tests
// the final assembly process during Finalize() calls.
//
// Test scenarios:
// - zip32_small/large: Standard ZIP format (supports files <4GB) with varying segment counts
// - zip64_small/large: ZIP64 format (handles >4GB files) with extended headers
// - zip64_huge_segments: Large 64KB segments that require ZIP64 format
//
// The benchmark focuses on finalization overhead including:
// - Data descriptor generation for streaming entries
// - Central directory assembly with file metadata
// - ZIP64 extended information extra fields when needed
// - Final ZIP structure validation and writing
//
// Measures: ZIP structure generation speed, ZIP32 vs ZIP64 overhead, finalization efficiency
func BenchmarkSegmentWriter_ZIPGeneration(b *testing.B) {
	testCases := []struct {
		name         string
		segmentCount int
		segmentSize  int
		zip64Mode    Zip64Mode
	}{
		{"zip32_small", 10, 1024, Zip64Never},
		{"zip32_large", 100, 1024, Zip64Never},
		{"zip64_small", 10, 1024, Zip64Always},
		{"zip64_large", 100, 1024, Zip64Always},
		{"zip64_huge_segments", 5, 65536, Zip64Auto}, // Auto triggers ZIP64 by size
	}

	for _, tc := range testCases {
		b.Run(tc.name, func(b *testing.B) {
			options := []Option{WithZip64Mode(tc.zip64Mode)}

			b.ResetTimer()
			b.ReportAllocs()

			for i := 0; i < b.N; i++ {
				writer := NewSegmentTDFWriter(tc.segmentCount, options...)
				ctx := b.Context()

				// Create test segment data
				segmentData := make([]byte, tc.segmentSize)
				for j := range segmentData {
					segmentData[j] = byte(j % 256)
				}

				// Write all segments
				for segIdx := 0; segIdx < tc.segmentCount; segIdx++ {
					_, err := writer.WriteSegment(ctx, segIdx, segmentData)
					if err != nil {
						b.Fatal(err)
					}
				}

				// Focus benchmark on finalization (ZIP generation)
				manifest := []byte(`{"zip_generation_test": true}`)
				_, err := writer.Finalize(ctx, manifest)
				if err != nil {
					b.Fatal(err)
				}

				writer.Close()
			}
		})
	}
}

// generateWriteOrder creates deterministic segment write orders for consistent benchmark testing.
//
// This function generates various write patterns to test different aspects of the segment writer's
// performance characteristics. All patterns are deterministic to ensure reproducible benchmark results.
//
// Supported patterns:
// - "sequential": Natural order (0,1,2,3...)
// - "reverse": Backward order (...3,2,1,0)
// - "interleaved": Even indices first (0,2,4...), then odd (1,3,5...) - moderate out-of-order
// - "worst_case": Middle-out pattern starting from center, alternating left/right - maximizes buffering
// - "random"/"mixed": Pseudo-random using modular arithmetic (i*17+7)%count for deterministic chaos
//
// The patterns are designed to stress different aspects:
// - Memory usage patterns
// - Processing efficiency
// - Cache locality (how segments are accessed in memory)
func generateWriteOrder(count int, pattern string) []int {
	order := make([]int, count)

	switch pattern {
	case "sequential":
		for i := 0; i < count; i++ {
			order[i] = i
		}
	case "reverse":
		for i := 0; i < count; i++ {
			order[i] = count - 1 - i
		}
	case "interleaved":
		// Interleaved pattern: 0,2,4,6...1,3,5,7... (moderate out-of-order)
		idx := 0
		// First pass: even indices
		for i := 0; i < count; i += 2 {
			if idx < count {
				order[idx] = i
				idx++
			}
		}
		// Second pass: odd indices
		for i := 1; i < count; i += 2 {
			if idx < count {
				order[idx] = i
				idx++
			}
		}
	case "worst_case":
		// A scattered pattern that stresses segment bookkeeping
		mid := count / 2
		order[0] = mid
		left, right := mid-1, mid+1
		idx := 1

		// Alternate left and right from middle
		for left >= 0 || right < count {
			if left >= 0 && idx < count {
				order[idx] = left
				idx++
				left--
			}
			if right < count && idx < count {
				order[idx] = right
				idx++
				right++
			}
		}
	case "random", "mixed":
		// Generate pseudo-random but deterministic pattern for consistent benchmarks
		for i := 0; i < count; i++ {
			// Simple deterministic pseudo-random: use modular arithmetic
			order[i] = (i*17 + 7) % count
		}
		// Ensure all indices are covered
		used := make(map[int]bool)
		result := make([]int, 0, count)
		for _, idx := range order {
			if !used[idx] {
				result = append(result, idx)
				used[idx] = true
			}
		}
		// Fill in any missing indices
		for i := 0; i < count; i++ {
			if !used[i] {
				result = append(result, i)
			}
		}
		return result
	default:
		// Default to sequential
		for i := 0; i < count; i++ {
			order[i] = i
		}
	}

	return order
}
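
For context on the "~40–60 bytes" note in the commit message and the data-descriptor overhead the ZIPGeneration benchmark exercises: per the ZIP APPNOTE, a signed data descriptor is the optional 4-byte signature 0x08074b50 followed by the CRC-32 and the compressed/uncompressed sizes, which are 4 bytes each in ZIP32 form and 8 bytes each when the entry uses ZIP64. A minimal layout sketch follows; writeDataDescriptor is a hypothetical helper, not the writer's actual API:

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

const dataDescriptorSig uint32 = 0x08074b50 // "PK\x07\x08"

// writeDataDescriptor appends a signed data descriptor of the kind emitted
// after a streamed entry's payload when sizes were unknown at header time.
// binary.Write to a bytes.Buffer never fails, so errors are discarded here.
func writeDataDescriptor(buf *bytes.Buffer, crc uint32, compressed, uncompressed uint64, zip64 bool) {
	_ = binary.Write(buf, binary.LittleEndian, dataDescriptorSig)
	_ = binary.Write(buf, binary.LittleEndian, crc)
	if zip64 {
		// ZIP64 descriptors carry 8-byte sizes.
		_ = binary.Write(buf, binary.LittleEndian, compressed)
		_ = binary.Write(buf, binary.LittleEndian, uncompressed)
	} else {
		_ = binary.Write(buf, binary.LittleEndian, uint32(compressed))
		_ = binary.Write(buf, binary.LittleEndian, uint32(uncompressed))
	}
}

func main() {
	var buf bytes.Buffer
	writeDataDescriptor(&buf, 0xDEADBEEF, 1024, 1024, false)
	fmt.Println("zip32 descriptor bytes:", buf.Len()) // 16
	buf.Reset()
	writeDataDescriptor(&buf, 0xDEADBEEF, 1024, 1024, true)
	fmt.Println("zip64 descriptor bytes:", buf.Len()) // 24
}
```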
