Skip to content

Commit bc5fa11

Browse files
committed
Unroll the zeroing loop
Zeroing the stack is a huge part of the total cost of cross-compartment calls. Running on Sail (reporting retired instructions), the compartment-switcher benchmark (from #37) reports the following (stack size, call+return, call, return):

0x100    213   127    86
0x200    405   223   182
0x400    789   415   374
0x800   1557   799   758
0x1000  3093  1567  1526

If we skip stack zeroing, the numbers are 95, 67, 28 for all of them. This means that, even with tiny stacks, zeroing is more than half of the cost of a compartment switch, and that cost grows with larger compartments.

We currently require three instructions per store in the zeroing loop. This commit unrolls the loop so that it zeroes 32 bytes at a time for as long as it can, and then zeroes any remaining 16 bytes. The ABI mandates 16-byte alignment for the stack, so we now force the top and bottom of the callee's stack chunk to be 16-byte aligned and assume that we're under attack if this is not the case. It turned out that the loader was not guaranteeing this (though it happened to be true for everything in the repo except the allocator benchmark, which is a special snowflake). The loader now rounds all [trusted] stack allocations up to a multiple of 16 bytes.

The results are now:

0x100    166   106    60
0x200    262   154   108
0x400    454   250   204
0x800    838   442   396
0x1000  1606   826   780

512 bytes is probably the smallest stack size that makes sense for a thread, and here we see almost a 25% speedup. This has a fairly noticeable impact on the test suite too:

Before: Test runner: All tests finished in 2385962 cycles
After:  Test runner: All tests finished in 2082193 cycles
1 parent f9d9281 commit bc5fa11

File tree

2 files changed

+68
-18
lines changed

2 files changed

+68
-18
lines changed

sdk/core/loader/boot.cc

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,30 @@ namespace
3535
};
3636
constexpr ThreadConfig ThreadConfigs[] = CONFIG_THREADS;
3737

38+
/**
 * Round `value` up to the next multiple of `Multiple`.
 *
 * `Multiple` must be a non-zero power of two; this is enforced at compile
 * time.  Zero is rejected explicitly because the usual `x & (x - 1)` test
 * accepts it, and masking with zero would make every call return 0.
 *
 * Values that are already a multiple of `Multiple` are returned unchanged.
 * The addition wraps (as for any unsigned arithmetic) if `value` is within
 * `Multiple - 1` of the maximum `size_t`.
 */
template<size_t Multiple>
constexpr size_t round_up(size_t value)
{
	static_assert(Multiple != 0 && (Multiple & (Multiple - 1)) == 0,
	              "Multiple must be a non-zero power of two");
	// For a power of two, `~(Multiple - 1)` clears exactly the low bits that
	// must be zero in any multiple of `Multiple`.  This is the same mask as
	// `-Multiple` but avoids negating an unsigned value.
	constexpr size_t Mask = ~(Multiple - 1);
	return (value + (Multiple - 1)) & Mask;
}
48+
static_assert(round_up<16>(15) == 16);
49+
static_assert(round_up<16>(28) == 32);
50+
static_assert(round_up<8>(17) == 24);
51+
3852
/**
 * Compute, at compile time, the number of bytes needed to hold the stack and
 * the trusted stack of every thread described by `configs`.
 *
 * This is a diff view: the lines whose gutter marker ends in `-` are the
 * pre-commit accumulation (no rounding) and the lines whose gutter marker
 * ends in `+` are the post-commit replacement, in which each of the two
 * per-thread contributions is rounded up to a multiple of 16 bytes with
 * `round_up<16>`.
 *
 * @param configs  Array of `N` thread configurations; `stackSize` and
 *                 `trustedStackFrames` are read from each entry.
 * @return The sum of the per-thread sizes, in bytes.
 */
template<size_t N>
3953
constexpr size_t total_stacksize(const ThreadConfig (&configs)[N])
4054
{
4155
size_t ret = 0;
4256
for (const auto &config : configs)
4357
{
44-
ret += config.stackSize;
45-
ret += sizeof(TrustedStack) +
46-
sizeof(TrustedStackFrame) * config.trustedStackFrames;
58+
// Stack for this thread, padded to a 16-byte multiple.
ret += round_up<16>(config.stackSize);
59+
// Trusted stack for this thread: the fixed header plus one frame per
// configured trusted stack frame, padded as a whole to a 16-byte multiple.
ret +=
60+
round_up<16>(sizeof(TrustedStack) + sizeof(TrustedStackFrame) *
61+
config.trustedStackFrames);
4762
}
4863
return ret;
4964
}
}
@@ -54,8 +69,12 @@ namespace
5469
static_assert(
5570
CheckSize<BOOT_TSTACK_SIZE, sizeof(TrustedStackGeneric<0>)>::Value,
5671
"Boot trusted stack sizes do not match.");
57-
alignas(TrustedStack) char __section(".thread_stacks")
72+
// Stacks must be 16-byte aligned, so ensure that this is 16-byte aligned.
73+
alignas(16) char __section(".thread_stacks")
5874
stackSpace[total_stacksize(ThreadConfigs)];
75+
// It must also be aligned sufficiently for trusted stacks, so ensure that
76+
// we've captured that requirement above.
77+
static_assert(alignof(TrustedStack) <= 16);
5978
__END_DECLS
6079

6180
static_assert(
@@ -723,6 +742,8 @@ namespace
723742
false>(LA_ABS(stackSpace), sizeof(stackSpace));
724743
/// Allocate some space from the pool.
725744
auto allocate = [&](size_t size) {
745+
Debug::log("Rounded up {} to {}", size, round_up<16>(size));
746+
size = round_up<16>(size);
726747
auto ret = stackArea;
727748
ret.bounds() = size;
728749
stackArea.address() += size;
@@ -734,7 +755,13 @@ namespace
734755
// Trusted stack root has both global and store local,
735756
ret.permissions() &= ret.permissions().without(Permission::Global);
736757
// Move the pointer to the top for stack usage.
737-
ret.address() += size;
758+
Debug::Invariant((ret.address() & 0xf) == 0,
759+
"Stack is not 16-byte aligned: {}",
760+
ret);
761+
ret.address() += ret.length();
762+
Debug::Invariant((ret.address() & 0xf) == 0,
763+
"Stack is not 16-byte aligned: {}",
764+
ret);
738765
return ret;
739766
};
740767
/// Allocate a trusted stack from the pool.

sdk/core/switcher/entry.S

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,40 @@ switcher_scheduler_entry_csp:
9595
// make sure the caller's CSP is unsealed
9696
cgettype t2, \reg
9797
bnez t2, .Lforce_unwind
98+
// Check that the base is 16-byte aligned
99+
cgetbase t2, csp
100+
andi t2, t2, 0xf
101+
bnez t2, .Lforce_unwind
102+
// Check that the address (top of the remainder) is 16-byte aligned
103+
andi t2, sp, 0xf
104+
bnez t2, .Lforce_unwind
105+
.endm
106+
107+
/**
 * Zero the stack.  The three operands are the base address, the top address,
 * and a scratch register to use.  The base must be a capability but it must
 * be provided without the c prefix because it is used as both a capability
 * and integer register.
 *
 * The range [base, top) is zeroed by storing the null capability: first in
 * unrolled 32-byte chunks (four 8-byte stores per iteration) and then, if 16
 * bytes remain, with two further stores.
 *
 * On exit, base has been advanced past every byte that was zeroed, so it
 * points at the original top whenever the length of the range is a multiple
 * of 16 bytes.  Top and scratch are both clobbered (top is left holding the
 * original top minus 16).
 *
 * Both addresses are expected to be 16-byte aligned; any misaligned
 * remainder smaller than 16 bytes would not be zeroed.
 *
 * NOTE(review): bgt / ble are signed comparisons, so this assumes that base
 * and top lie on the same side of the signed-address boundary - confirm for
 * the target memory map.
 */
.macro zero_stack base top scratch
	// scratch = highest address at which a 32-byte chunk still fits.
	addi               \scratch, \top, -32
	// top = highest address at which a 16-byte chunk still fits.
	addi               \top, \top, -16
	// Skip the unrolled loop if fewer than 32 bytes remain.
	bgt                \base, \scratch, 1f
	// Zero the stack in 32-byte chunks
0:
	csc                cnull, 0(c\base)
	csc                cnull, 8(c\base)
	csc                cnull, 16(c\base)
	csc                cnull, 24(c\base)
	cincoffset         c\base, c\base, 32
	ble                \base, \scratch, 0b
1:
	// Skip the tail if fewer than 16 bytes remain.
	bgt                \base, \top, 2f
	// Zero any 16-byte tail
	csc                cnull, 0(c\base)
	csc                cnull, 8(c\base)
	// Advance past the tail so that base ends at the top on this path too,
	// matching the 32-byte path.
	cincoffset         c\base, c\base, 16
2:
.endm
99133

100134
.section .text, "ax", @progbits
@@ -148,12 +182,7 @@ compartment_switcher_entry:
148182
csetaddr csp, csp, s1
149183
sub s1, s0, s1
150184
csetboundsexact csp, csp, s1
151-
bge sp, s0, .Lout
152-
// Zero the part that the caller offers to the callee.
153-
.Lzero_loop:
154-
csc c0, 0(csp)
155-
cincoffset csp, csp, 8
156-
blt sp, s0, .Lzero_loop
185+
zero_stack sp, s0, gp
157186
#endif // CONFIG_NO_SWITCHER_SAFETY
158187
.Lout:
159188
// Fetch the sealing key
@@ -668,12 +697,6 @@ exception_entry_asm:
668697
cgetbase tp, csp
669698
cgetaddr t1, csp
670699
csetaddr ct2, csp, tp
671-
bge t2, t1, .Lout2
672-
// Zero the stack used by the callee.
673-
.Lzero_loop2:
674-
csc c0, 0(ct2)
675-
cincoffset ct2, ct2, 8
676-
blt t2, t1, .Lzero_loop2
677-
.Lout2:
700+
zero_stack t2, t1, tp
678701
#endif // CONFIG_NO_SWITCHER_SAFETY
679702
cret

0 commit comments

Comments
 (0)