crypto - threaded K12: separate context computation from thread spawning (#25793)

jedisct1 · web-flow · commit ee4df4ad3eda · 2025-11-03T17:09:00.000+01:00
* threaded K12: separate context computation from thread spawning

Compute all contexts and store them in a pre-allocated array,
then spawn threads using the pre-computed contexts.

This ensures each context is fully materialized in memory with the
correct values before any thread tries to access it.

* kt128: unroll the permutation rounds only twice

This appears to deliver the best performance thanks to improved cache
utilization, and it’s consistent with what we already do for SHA3.
diff --git a/lib/std/crypto/kangarootwelve.zig b/lib/std/crypto/kangarootwelve.zig
@@ -230,58 +230,61 @@ fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
         break :blk offsets;
     };
 
-    inline for (RC) |rc| {
-        // θ (theta)
-        var C: [5]@Vector(N, u64) = undefined;
-        inline for (0..5) |x| {
-            C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4];
-        }
+    var round: usize = 0;
+    while (round < 12) : (round += 2) {
+        inline for (0..2) |i| {
+            // θ (theta)
+            var C: [5]@Vector(N, u64) = undefined;
+            inline for (0..5) |x| {
+                C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4];
+            }
 
-        var D: [5]@Vector(N, u64) = undefined;
-        inline for (0..5) |x| {
-            D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1);
-        }
+            var D: [5]@Vector(N, u64) = undefined;
+            inline for (0..5) |x| {
+                D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1);
+            }
 
-        // Apply D to all lanes
-        inline for (0..5) |x| {
-            states[x][0] ^= D[x];
-            states[x][1] ^= D[x];
-            states[x][2] ^= D[x];
-            states[x][3] ^= D[x];
-            states[x][4] ^= D[x];
-        }
+            // Apply D to all lanes
+            inline for (0..5) |x| {
+                states[x][0] ^= D[x];
+                states[x][1] ^= D[x];
+                states[x][2] ^= D[x];
+                states[x][3] ^= D[x];
+                states[x][4] ^= D[x];
+            }
 
-        // ρ (rho) and π (pi) - optimized with pre-computed offsets
-        var current = states[1][0];
-        var px: usize = 1;
-        var py: usize = 0;
-        inline for (rho_offsets) |rot| {
-            const next_y = (2 * px + 3 * py) % 5;
-            const next = states[py][next_y];
-            states[py][next_y] = rol64Vec(N, current, rot);
-            current = next;
-            px = py;
-            py = next_y;
-        }
+            // ρ (rho) and π (pi) - optimized with pre-computed offsets
+            var current = states[1][0];
+            var px: usize = 1;
+            var py: usize = 0;
+            inline for (rho_offsets) |rot| {
+                const next_y = (2 * px + 3 * py) % 5;
+                const next = states[py][next_y];
+                states[py][next_y] = rol64Vec(N, current, rot);
+                current = next;
+                px = py;
+                py = next_y;
+            }
 
-        // χ (chi) - optimized with better register usage
-        inline for (0..5) |y| {
-            const t0 = states[0][y];
-            const t1 = states[1][y];
-            const t2 = states[2][y];
-            const t3 = states[3][y];
-            const t4 = states[4][y];
-
-            states[0][y] = t0 ^ (~t1 & t2);
-            states[1][y] = t1 ^ (~t2 & t3);
-            states[2][y] = t2 ^ (~t3 & t4);
-            states[3][y] = t3 ^ (~t4 & t0);
-            states[4][y] = t4 ^ (~t0 & t1);
-        }
+            // χ (chi) - optimized with better register usage
+            inline for (0..5) |y| {
+                const t0 = states[0][y];
+                const t1 = states[1][y];
+                const t2 = states[2][y];
+                const t3 = states[3][y];
+                const t4 = states[4][y];
+
+                states[0][y] = t0 ^ (~t1 & t2);
+                states[1][y] = t1 ^ (~t2 & t3);
+                states[2][y] = t2 ^ (~t3 & t4);
+                states[3][y] = t3 ^ (~t4 & t0);
+                states[4][y] = t4 ^ (~t0 & t1);
+            }
 
-        // ι (iota)
-        const rc_splat: @Vector(N, u64) = @splat(rc);
-        states[0][0] ^= rc_splat;
+            // ι (iota)
+            const rc_splat: @Vector(N, u64) = @splat(RC[round + i]);
+            states[0][0] ^= rc_splat;
+        }
     }
 }
 
@@ -323,46 +326,49 @@ fn keccakP(state: *[200]u8) void {
     }
 
     // Apply 12 rounds
-    inline for (RC) |rc| {
-        // θ
-        var C: [5]u64 = undefined;
-        inline for (0..5) |x| {
-            C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4];
-        }
-        var D: [5]u64 = undefined;
-        inline for (0..5) |x| {
-            D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1);
-        }
-        inline for (0..5) |x| {
-            inline for (0..5) |y| {
-                lanes[x][y] ^= D[x];
+    var round: usize = 0;
+    while (round < 12) : (round += 2) {
+        inline for (0..2) |i| {
+            // θ
+            var C: [5]u64 = undefined;
+            inline for (0..5) |x| {
+                C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4];
+            }
+            var D: [5]u64 = undefined;
+            inline for (0..5) |x| {
+                D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1);
+            }
+            inline for (0..5) |x| {
+                inline for (0..5) |y| {
+                    lanes[x][y] ^= D[x];
+                }
             }
-        }
 
-        // ρ and π
-        var current = lanes[1][0];
-        var px: usize = 1;
-        var py: usize = 0;
-        inline for (0..24) |t| {
-            const temp = lanes[py][(2 * px + 3 * py) % 5];
-            const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
-            lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount)));
-            current = temp;
-            const temp_x = py;
-            py = (2 * px + 3 * py) % 5;
-            px = temp_x;
-        }
+            // ρ and π
+            var current = lanes[1][0];
+            var px: usize = 1;
+            var py: usize = 0;
+            inline for (0..24) |t| {
+                const temp = lanes[py][(2 * px + 3 * py) % 5];
+                const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
+                lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount)));
+                current = temp;
+                const temp_x = py;
+                py = (2 * px + 3 * py) % 5;
+                px = temp_x;
+            }
 
-        // χ
-        inline for (0..5) |y| {
-            const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] };
-            inline for (0..5) |x| {
-                lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]);
+            // χ
+            inline for (0..5) |y| {
+                const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] };
+                inline for (0..5) |x| {
+                    lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]);
+                }
             }
-        }
 
-        // ι
-        lanes[0][0] ^= rc;
+            // ι
+            lanes[0][0] ^= RC[round + i];
+        }
     }
 
     // Store lanes back to state
@@ -759,32 +765,37 @@ fn ktMultiThreaded(
     const all_scratch = try allocator.alloc(u8, thread_count * scratch_size);
     defer allocator.free(all_scratch);
 
-    var group: Io.Group = .init;
+    const contexts = try allocator.alloc(LeafBatchContext, thread_count);
+    defer allocator.free(contexts);
+
     var leaves_assigned: usize = 0;
-    var thread_idx: usize = 0;
+    var context_count: usize = 0;
 
     while (leaves_assigned < total_leaves) {
         const batch_count = @min(leaves_per_thread, total_leaves - leaves_assigned);
         const batch_start = chunk_size + leaves_assigned * chunk_size;
         const cvs_offset = leaves_assigned * cv_size;
 
-        const ctx = LeafBatchContext{
+        contexts[context_count] = LeafBatchContext{
             .output_cvs = cvs[cvs_offset .. cvs_offset + batch_count * cv_size],
             .batch_start = batch_start,
             .batch_count = batch_count,
             .view = view,
-            .scratch_buffer = all_scratch[thread_idx * scratch_size .. (thread_idx + 1) * scratch_size],
+            .scratch_buffer = all_scratch[context_count * scratch_size .. (context_count + 1) * scratch_size],
             .total_len = total_len,
         };
 
+        leaves_assigned += batch_count;
+        context_count += 1;
+    }
+
+    var group: Io.Group = .init;
+    for (contexts[0..context_count]) |ctx| {
         group.async(io, struct {
             fn process(c: LeafBatchContext) void {
                 processLeafBatch(Variant, c);
             }
         }.process, .{ctx});
-
-        leaves_assigned += batch_count;
-        thread_idx += 1;
     }
 
     // Wait for all threads to complete