-
Notifications
You must be signed in to change notification settings - Fork 24
feat: add segment-based streaming ZIP writer, ZIP64 modes; replace legacy writer #2723
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
strantalis
wants to merge
4
commits into
opentdf:main
Choose a base branch
from
strantalis:feature/archive-writer
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,371 @@ | ||
| package archive | ||
|
|
||
| import ( | ||
| "math/rand" | ||
| "testing" | ||
| ) | ||
|
|
||
| // BenchmarkSegmentWriter_CRC32ContiguousProcessing benchmarks CRC32-related performance under | ||
| // different segment write ordering patterns. The current implementation uses CRC32-combine over | ||
| // per-segment CRCs and sizes and does not retain payload bytes between calls. | ||
| // | ||
| // Test patterns: | ||
| // - sequential: Optimal case where segments arrive in order (enables immediate processing) | ||
| // - reverse: Worst case where all segments must be buffered until the end | ||
| // - random: Pseudo-random order using deterministic pattern for reproducible results | ||
| // - interleaved: Moderate out-of-order (even indices first, then odd) | ||
| // - worst_case: Middle-out pattern that maximizes memory buffering requirements | ||
| // | ||
| // Measures: CRC32 calculation speed, memory allocation patterns, contiguous processing effectiveness | ||
| func BenchmarkSegmentWriter_CRC32ContiguousProcessing(b *testing.B) { | ||
| testCases := []struct { | ||
| name string | ||
| segmentCount int | ||
| segmentSize int | ||
| writeOrder string | ||
| }{ | ||
| {"sequential_100x1KB", 100, 1024, "sequential"}, | ||
| {"reverse_100x1KB", 100, 1024, "reverse"}, | ||
| {"random_100x1KB", 100, 1024, "random"}, | ||
| {"interleaved_100x1KB", 100, 1024, "interleaved"}, | ||
| {"worst_case_100x1KB", 100, 1024, "worst_case"}, | ||
| {"sequential_1000x1KB", 1000, 1024, "sequential"}, | ||
| {"reverse_1000x1KB", 1000, 1024, "reverse"}, | ||
| {"worst_case_1000x1KB", 1000, 1024, "worst_case"}, | ||
| } | ||
|
|
||
| for _, tc := range testCases { | ||
| b.Run(tc.name, func(b *testing.B) { | ||
| // Generate write order based on pattern | ||
| writeOrder := generateWriteOrder(tc.segmentCount, tc.writeOrder) | ||
|
|
||
| b.ResetTimer() | ||
| b.ReportAllocs() | ||
|
|
||
| for i := 0; i < b.N; i++ { | ||
| writer := NewSegmentTDFWriter(tc.segmentCount) | ||
| ctx := b.Context() | ||
|
|
||
| // Create test segment data | ||
| segmentData := make([]byte, tc.segmentSize) | ||
| for j := range segmentData { | ||
| segmentData[j] = byte(j % 256) | ||
| } | ||
|
|
||
| // Write segments in specified order | ||
| for _, segIdx := range writeOrder { | ||
| _, err := writer.WriteSegment(ctx, segIdx, segmentData) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
| } | ||
|
|
||
| // Finalize to trigger final CRC32 calculation | ||
| manifest := []byte(`{"test": "benchmark"}`) | ||
| _, err := writer.Finalize(ctx, manifest) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
|
|
||
| writer.Close() | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // BenchmarkSegmentWriter_VariableSegmentSizes benchmarks performance impact of variable segment sizes | ||
| // on memory allocation and CRC32 processing efficiency. | ||
| // | ||
| // This benchmark tests how the segment writer handles dynamic memory allocation when segments have | ||
| // unpredictable sizes. Variable sizes can impact both memory allocation patterns and CRC32 processing | ||
| // efficiency, as larger segments require more memory and processing time. | ||
| // | ||
| // Test patterns: | ||
| // - uniform_1KB: Baseline with consistent 1KB segments for comparison | ||
| // - doubling: Exponentially increasing sizes (512B → 8KB) to test scaling | ||
| // - extreme_variance: Mixed small/large segments to stress memory allocator | ||
| // - fibonacci_like: Fibonacci-inspired progression for gradual size increases | ||
| // - large_mixed: Various large segments to test high memory usage patterns | ||
| // | ||
| // Measures: Memory allocation efficiency, CRC32 processing with varying data volumes, GC impact | ||
| func BenchmarkSegmentWriter_VariableSegmentSizes(b *testing.B) { | ||
| testCases := []struct { | ||
| name string | ||
| sizes []int | ||
| }{ | ||
| {"uniform_1KB", []int{1024, 1024, 1024, 1024, 1024}}, | ||
| {"doubling", []int{512, 1024, 2048, 4096, 8192}}, | ||
| {"extreme_variance", []int{100, 10240, 200, 20480, 300}}, | ||
| {"fibonacci_like", []int{256, 512, 768, 1280, 2048}}, | ||
| {"large_mixed", []int{1024, 16384, 4096, 32768, 8192}}, | ||
| } | ||
|
|
||
| for _, tc := range testCases { | ||
| b.Run(tc.name, func(b *testing.B) { | ||
| b.ResetTimer() | ||
| b.ReportAllocs() | ||
|
|
||
| for i := 0; i < b.N; i++ { | ||
| writer := NewSegmentTDFWriter(len(tc.sizes)) | ||
| ctx := b.Context() | ||
|
|
||
| // Write segments with variable sizes | ||
| for segIdx, size := range tc.sizes { | ||
| segmentData := make([]byte, size) | ||
| for j := range segmentData { | ||
| segmentData[j] = byte((segIdx * j) % 256) | ||
| } | ||
|
|
||
| _, err := writer.WriteSegment(ctx, segIdx, segmentData) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
| } | ||
|
|
||
| // Finalize | ||
| manifest := []byte(`{"variable_sizes": true}`) | ||
| _, err := writer.Finalize(ctx, manifest) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
|
|
||
| writer.Close() | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // BenchmarkSegmentWriter_MemoryPressure benchmarks memory allocation patterns and buffering efficiency | ||
| // under various segment count and size combinations. | ||
| // | ||
| // This benchmark specifically targets memory allocation behavior to identify potential memory leaks, | ||
| // inefficient buffering strategies, and garbage collection impact. It uses WithMaxSegments(count*2) | ||
| // to allow extra buffering capacity and tests different buffer policies. | ||
| // | ||
| // Test scenarios: | ||
| // - small_segments: High segment count (1000) with minimal individual memory (512B each) | ||
| // - large_segments: Fewer segments (100) with larger memory footprint (8KB each) | ||
| // - mixed_sizes: Dynamic sizes from 512B to 4KB based on segment index modulo | ||
| // | ||
| // Write patterns test memory behavior: | ||
| // - sequential: Minimal buffering, immediate processing and cleanup | ||
| // - reverse: Maximum buffering until all segments received | ||
| // - interleaved: Moderate buffering with periodic cleanup opportunities | ||
| // - worst_case: Scattered pattern maximizing memory retention | ||
| // | ||
| // Measures: Peak memory usage, allocation patterns, buffer cleanup efficiency, GC pressure | ||
| func BenchmarkSegmentWriter_MemoryPressure(b *testing.B) { | ||
| testCases := []struct { | ||
| name string | ||
| segmentCount int | ||
| segmentSize int | ||
| bufferPolicy string | ||
| }{ | ||
| {"small_segments_sequential", 1000, 512, "sequential"}, | ||
| {"small_segments_reverse", 1000, 512, "reverse"}, | ||
| {"small_segments_worst_case", 1000, 512, "worst_case"}, | ||
| {"large_segments_sequential", 100, 8192, "sequential"}, | ||
| {"large_segments_reverse", 100, 8192, "reverse"}, | ||
| {"large_segments_interleaved", 100, 8192, "interleaved"}, | ||
| {"mixed_sizes_random", 500, 0, "mixed"}, // 0 = variable sizes | ||
| } | ||
|
|
||
| for _, tc := range testCases { | ||
| b.Run(tc.name, func(b *testing.B) { | ||
| writeOrder := generateWriteOrder(tc.segmentCount, tc.bufferPolicy) | ||
|
|
||
| b.ResetTimer() | ||
| b.ReportAllocs() | ||
|
|
||
| for i := 0; i < b.N; i++ { | ||
| writer := NewSegmentTDFWriter(tc.segmentCount, WithMaxSegments(tc.segmentCount*2)) | ||
| ctx := b.Context() | ||
|
|
||
| // Write segments with focus on memory allocation patterns | ||
| for orderIdx, segIdx := range writeOrder { | ||
| var segmentData []byte | ||
|
|
||
| if tc.segmentSize == 0 { // Mixed sizes mode | ||
| size := 512 + (segIdx%8)*512 // Sizes from 512 to 4096 | ||
| segmentData = make([]byte, size) | ||
| } else { | ||
| segmentData = make([]byte, tc.segmentSize) | ||
| } | ||
|
|
||
| // Fill with deterministic test data | ||
| for j := range segmentData { | ||
| segmentData[j] = byte((orderIdx * j) % 256) | ||
| } | ||
|
|
||
| _, err := writer.WriteSegment(ctx, segIdx, segmentData) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
| } | ||
|
|
||
| // Finalize | ||
| manifest := []byte(`{"memory_test": true}`) | ||
| _, err := writer.Finalize(ctx, manifest) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
|
|
||
| writer.Close() | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // BenchmarkSegmentWriter_ZIPGeneration benchmarks ZIP archive structure generation performance, | ||
| // focusing on the finalization process where the complete ZIP structure is assembled. | ||
| // | ||
| // This benchmark measures the overhead of generating ZIP format structures including local file headers, | ||
| // central directory records, and data descriptors. It compares ZIP32 vs ZIP64 performance and tests | ||
| // the final assembly process during Finalize() calls. | ||
| // | ||
| // Test scenarios: | ||
| // - zip32_small/large: Standard ZIP format (supports files <4GB) with varying segment counts | ||
| // - zip64_small/large: ZIP64 format (handles >4GB files) with extended headers | ||
| // - zip64_huge_segments: Large 64KB segments that require ZIP64 format | ||
| // | ||
| // The benchmark focuses on finalization overhead including: | ||
| // - Data descriptor generation for streaming entries | ||
| // - Central directory assembly with file metadata | ||
| // - ZIP64 extended information extra fields when needed | ||
| // - Final ZIP structure validation and writing | ||
| // | ||
| // Measures: ZIP structure generation speed, ZIP32 vs ZIP64 overhead, finalization efficiency | ||
| func BenchmarkSegmentWriter_ZIPGeneration(b *testing.B) { | ||
| testCases := []struct { | ||
| name string | ||
| segmentCount int | ||
| segmentSize int | ||
| zip64Mode Zip64Mode | ||
| }{ | ||
| {"zip32_small", 10, 1024, Zip64Never}, | ||
| {"zip32_large", 100, 1024, Zip64Never}, | ||
| {"zip64_small", 10, 1024, Zip64Always}, | ||
| {"zip64_large", 100, 1024, Zip64Always}, | ||
| {"zip64_huge_segments", 5, 65536, Zip64Auto}, // Auto triggers ZIP64 by size | ||
| } | ||
|
|
||
| for _, tc := range testCases { | ||
| b.Run(tc.name, func(b *testing.B) { | ||
| options := []Option{WithZip64Mode(tc.zip64Mode)} | ||
|
|
||
| b.ResetTimer() | ||
| b.ReportAllocs() | ||
|
|
||
| for i := 0; i < b.N; i++ { | ||
| writer := NewSegmentTDFWriter(tc.segmentCount, options...) | ||
| ctx := b.Context() | ||
|
|
||
| // Create test segment data | ||
| segmentData := make([]byte, tc.segmentSize) | ||
| for j := range segmentData { | ||
| segmentData[j] = byte(j % 256) | ||
| } | ||
|
|
||
| // Write all segments | ||
| for segIdx := 0; segIdx < tc.segmentCount; segIdx++ { | ||
| _, err := writer.WriteSegment(ctx, segIdx, segmentData) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
| } | ||
|
|
||
| // Focus benchmark on finalization (ZIP generation) | ||
| manifest := []byte(`{"zip_generation_test": true}`) | ||
| _, err := writer.Finalize(ctx, manifest) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
|
|
||
| writer.Close() | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // generateWriteOrder creates deterministic segment write orders for consistent benchmark testing. | ||
| // | ||
| // This function generates various write patterns to test different aspects of the segment writer's | ||
| // performance characteristics. All patterns are deterministic to ensure reproducible benchmark results. | ||
| // | ||
| // Supported patterns: | ||
| // - "sequential": Natural order (0,1,2,3...) | ||
| // - "reverse": Backward order (...3,2,1,0) | ||
| // - "interleaved": Even indices first (0,2,4...), then odd (1,3,5...) - moderate out-of-order | ||
| // - "worst_case": Middle-out pattern starting from center, alternating left/right - maximizes buffering | ||
| // - "random"/"mixed": Deterministic shuffle using rand.Shuffle with a fixed seed | ||
| // | ||
| // The patterns are designed to stress different aspects: | ||
| // - Memory usage patterns | ||
| // - Processing efficiency | ||
| // - Cache locality (how segments are accessed in memory) | ||
| func generateWriteOrder(count int, pattern string) []int { | ||
| order := make([]int, count) | ||
|
|
||
| switch pattern { | ||
| case "sequential": | ||
| for i := 0; i < count; i++ { | ||
| order[i] = i | ||
| } | ||
| case "reverse": | ||
| for i := 0; i < count; i++ { | ||
| order[i] = count - 1 - i | ||
| } | ||
| case "interleaved": | ||
| // Interleaved pattern: 0,2,4,6...1,3,5,7... (moderate out-of-order) | ||
| idx := 0 | ||
| // First pass: even indices | ||
| for i := 0; i < count; i += 2 { | ||
| if idx < count { | ||
| order[idx] = i | ||
| idx++ | ||
| } | ||
| } | ||
| // Second pass: odd indices | ||
| for i := 1; i < count; i += 2 { | ||
| if idx < count { | ||
| order[idx] = i | ||
| idx++ | ||
| } | ||
| } | ||
| case "worst_case": | ||
| // A scattered pattern that stresses segment bookkeeping | ||
| mid := count / 2 | ||
| order[0] = mid | ||
| left, right := mid-1, mid+1 | ||
| idx := 1 | ||
|
|
||
| // Alternate left and right from middle | ||
| for left >= 0 || right < count { | ||
| if left >= 0 && idx < count { | ||
| order[idx] = left | ||
| idx++ | ||
| left-- | ||
| } | ||
| if right < count && idx < count { | ||
| order[idx] = right | ||
| idx++ | ||
| right++ | ||
| } | ||
| } | ||
| case "random", "mixed": | ||
| // Deterministic shuffle using a fixed seed for reproducible benchmarks | ||
| for i := 0; i < count; i++ { | ||
| order[i] = i | ||
| } | ||
| r := rand.New(rand.NewSource(42)) | ||
| r.Shuffle(len(order), func(i, j int) { order[i], order[j] = order[j], order[i] }) | ||
| return order | ||
| default: | ||
| // Default to sequential | ||
| for i := 0; i < count; i++ { | ||
| order[i] = i | ||
| } | ||
| } | ||
|
|
||
| return order | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion: Insrtead of
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the suggestion?