Unroll the zeroing loop

davidchisnall · davidchisnall · commit bc5fa119c9c0 · 2023-03-31T09:23:00.000+03:00
Zeroing the stack is a huge part of the total cost of cross-compartment calls. Running on Sail (reporting retired instructions), the compartment-switcher benchmark (from #37) reports (stack size, call+return, call, return): 0x100 213 127 86; 0x200 405 223 182; 0x400 789 415 374; 0x800 1557 799 758; 0x1000 3093 1567 1526. If we skip stack zeroing, the numbers are 95, 67, 28 for all of them. This means that, even with tiny stacks, zeroing is more than half of the cost of a compartment switch and that cost grows with larger compartments. We currently require three instructions per store in the zeroing loop. This commit unrolls the loop so that we zero 32 bytes at a time for as long as we can, then a final 16 bytes if needed. The ABI mandates 16-byte alignment for the stack, so we now force the top and bottom of the callee's stack chunk to be 16-byte aligned and assume that we're under attack if this is not the case. It turned out that the loader was not guaranteeing this (though it happened to be true for everything in the repo except the allocator benchmark, which is a special snowflake). The loader now rounds all [trusted] stack allocations up to a multiple of 16 bytes. The results are now: 0x100 166 106 60; 0x200 262 154 108; 0x400 454 250 204; 0x800 838 442 396; 0x1000 1606 826 780. 512 bytes is probably the smallest stack size that makes sense for a thread and here we see almost a 25% speedup. This has a fairly noticeable impact on the test suite too. Before: "Test runner: All tests finished in 2385962 cycles". After: "Test runner: All tests finished in 2082193 cycles".
diff --git a/sdk/core/loader/boot.cc b/sdk/core/loader/boot.cc
@@ -35,15 +35,30 @@ namespace
 	};
 	constexpr ThreadConfig ThreadConfigs[] = CONFIG_THREADS;
 
+	/**
+	 * Round `value` up to a multiple of `Multiple` (a non-zero power of two).
+	 */
+	template<size_t Multiple>
+	constexpr size_t round_up(size_t value)
+	{
+		static_assert(Multiple != 0 && (Multiple & (Multiple - 1)) == 0,
+		              "Multiple must be a non-zero power of two");
+		return (value + Multiple - 1) & -Multiple; // -Multiple == ~(Multiple - 1)
+	}
+	static_assert(round_up<16>(15) == 16);
+	static_assert(round_up<16>(28) == 32);
+	static_assert(round_up<8>(17) == 24);
+
 	template<size_t N>
 	constexpr size_t total_stacksize(const ThreadConfig (&configs)[N])
 	{
 		size_t ret = 0;
 		for (const auto &config : configs)
 		{
-			ret += config.stackSize;
-			ret += sizeof(TrustedStack) +
-			       sizeof(TrustedStackFrame) * config.trustedStackFrames;
+			ret += round_up<16>(config.stackSize); // Stack, padded to 16 bytes
+			ret += // Trusted stack plus its frames, padded to 16 bytes as well
+			  round_up<16>(sizeof(TrustedStack) + sizeof(TrustedStackFrame) *
+			                                        config.trustedStackFrames);
 		}
 		return ret;
 	}
@@ -54,8 +69,12 @@ namespace
 	static_assert(
 	  CheckSize<BOOT_TSTACK_SIZE, sizeof(TrustedStackGeneric<0>)>::Value,
 	  "Boot trusted stack sizes do not match.");
-	alignas(TrustedStack) char __section(".thread_stacks")
+	// Stacks must be 16-byte aligned, so ensure that this is 16-byte aligned.
+	alignas(16) char __section(".thread_stacks")
 	  stackSpace[total_stacksize(ThreadConfigs)];
+	// It must also be aligned sufficiently for trusted stacks, so ensure that
+	// we've captured that requirement above.
+	static_assert(alignof(TrustedStack) <= 16);
 	__END_DECLS
 
 	static_assert(
@@ -723,6 +742,8 @@ namespace
 		                       false>(LA_ABS(stackSpace), sizeof(stackSpace));
 		/// Allocate some space from the pool.
 		auto allocate = [&](size_t size) {
+			Debug::log("Rounded up {} to {}", size, round_up<16>(size));
+			size         = round_up<16>(size);
 			auto ret     = stackArea;
 			ret.bounds() = size;
 			stackArea.address() += size;
@@ -734,7 +755,13 @@ namespace
 			// Trusted stack root has both global and store local,
 			ret.permissions() &= ret.permissions().without(Permission::Global);
 			// Move the pointer to the top for stack usage.
-			ret.address() += size;
+			Debug::Invariant((ret.address() & 0xf) == 0,
+			                 "Stack is not 16-byte aligned: {}",
+			                 ret);
+			ret.address() += ret.length();
+			Debug::Invariant((ret.address() & 0xf) == 0,
+			                 "Stack is not 16-byte aligned: {}",
+			                 ret);
 			return ret;
 		};
 		/// Allocate a trusted stack from the pool.
diff --git a/sdk/core/switcher/entry.S b/sdk/core/switcher/entry.S
@@ -95,6 +95,40 @@ switcher_scheduler_entry_csp:
 	// make sure the caller's CSP is unsealed
 	cgettype           t2, \reg
 	bnez               t2, .Lforce_unwind
+	// Check that the base is 16-byte aligned
+	cgetbase           t2, csp
+	andi               t2, t2, 0xf
+	bnez               t2, .Lforce_unwind
+	// Check that the address (top of the remainder) is 16-byte aligned
+	andi               t2, sp, 0xf
+	bnez               t2, .Lforce_unwind
+.endm
+
+/**
+ * Zero the stack from `base` up to `top`; both are assumed 16-byte aligned.
+ * `base` must be a capability but it must be provided without the c prefix
+ * because it is used as both a capability and an integer register.  On exit
+ * its address equals the original `top`, whether or not the 16-byte tail path
+ * ran.  `top` and `scratch` are both clobbered.
+ */
+.macro zero_stack base top scratch
+	addi               \scratch, \top, -32
+	addi               \top, \top, -16
+	bgt                \base, \scratch, 1f
+0:	// Zero the stack in 32-byte chunks
+	csc                cnull, 0(c\base)
+	csc                cnull, 8(c\base)
+	csc                cnull, 16(c\base)
+	csc                cnull, 24(c\base)
+	cincoffset         c\base, c\base, 32
+	ble                \base, \scratch, 0b
+1:
+	bgt                \base, \top, 2f
+	// Zero any 16-byte tail, then step over it so that base ends at the top
+	csc                cnull, 0(c\base)
+	csc                cnull, 8(c\base)
+	cincoffset         c\base, c\base, 16
+2:
 .endm
 
 	.section .text, "ax", @progbits
@@ -148,12 +182,7 @@ compartment_switcher_entry:
 	csetaddr           csp, csp, s1
 	sub                s1, s0, s1
 	csetboundsexact    csp, csp, s1
-	bge                sp, s0, .Lout
-	// Zero the part that the caller offers to the callee.
-.Lzero_loop:
-	csc                c0, 0(csp)
-	cincoffset         csp, csp, 8
-	blt                sp, s0, .Lzero_loop
+	zero_stack         sp, s0, gp
 #endif // CONFIG_NO_SWITCHER_SAFETY
 .Lout:
 	// Fetch the sealing key
@@ -668,12 +697,6 @@ exception_entry_asm:
 	cgetbase           tp, csp
 	cgetaddr           t1, csp
 	csetaddr           ct2, csp, tp
-	bge                t2, t1, .Lout2
-	// Zero the stack used by the callee.
-.Lzero_loop2:
-	csc                c0, 0(ct2)
-	cincoffset         ct2, ct2, 8
-	blt                t2, t1, .Lzero_loop2
-.Lout2:
+	zero_stack         t2, t1, tp
 #endif // CONFIG_NO_SWITCHER_SAFETY
 	cret