From 1910960964475cc4b10608a32f8aea46327fbf4b Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Wed, 7 May 2025 19:16:23 +0800 Subject: [PATCH 1/3] Run Neon NTT through SLOTHY and add Makefile This adds a Makefile that runs the Neon NTT through SLOTHY. To accommodate this, the clean assembly is moved to dev/aarch64_clean/, while mldsa/native/aarch64 contains the optimized assembly. The main difference to mlkem-native is that we need to set an explicit timeout as optimizing the second loop doesn't result in reasonable performance, but a good solution is found within one minute on my Apple M4. I set the timeout to 2 minutes with the hope that it works on most platforms. We may have to increase that later. For now the clean backend is not tested in CI - that's left for a follow-up PR. SLOTHY is also not run in CI, yet. We probably want to put the assembly simplification scripts in place so we can follow the same structure as in mlkem-native. Signed-off-by: Matthias J. Kannwischer --- dev/aarch64_clean/src/ntt.S | 305 +++++++ mldsa/native/aarch64/src/Makefile | 42 + mldsa/native/aarch64/src/ntt.S | 1244 ++++++++++++++++++++++++++--- 3 files changed, 1498 insertions(+), 93 deletions(-) create mode 100644 dev/aarch64_clean/src/ntt.S create mode 100644 mldsa/native/aarch64/src/Makefile diff --git a/dev/aarch64_clean/src/ntt.S b/dev/aarch64_clean/src/ntt.S new file mode 100644 index 00000000..edbc9f62 --- /dev/null +++ b/dev/aarch64_clean/src/ntt.S @@ -0,0 +1,305 @@ +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) + +.macro mulmodq dst, src, const, idx0, idx1 + sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()] + mul \dst\().4s, \src\().4s, \const\().s[\idx0\()] + mls \dst\().4s, t2.4s, consts.s[0] +.endm + +.macro mulmod dst, src, const, const_twisted + sqrdmulh t2.4s, \src\().4s, \const_twisted\().4s + mul \dst\().4s, \src\().4s, \const\().4s + mls \dst\().4s, t2.4s, consts.s[0] +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s +.endm + +.macro load_roots_123 + ldr q_root0, [r012345_ptr], #64 + ldr q_root1, [r012345_ptr, #(-64 + 16)] + ldr q_root2, [r012345_ptr, #(-64 + 32)] + ldr q_root3, [r012345_ptr, #(-64 + 48)] +.endm + +.macro load_roots_456 + ldr q_root0, [r012345_ptr], #64 + ldr q_root1, [r012345_ptr, #(-64 + 16)] + ldr q_root2, [r012345_ptr, #(-64 + 32)] + ldr q_root3, [r012345_ptr, #(-64 + 48)] +.endm + +.macro load_roots_78_part1 + ldr 
q_root0, [r67_ptr], #(12*16) + ldr q_root0_tw, [r67_ptr, #(-12*16 + 1*16)] + ldr q_root1, [r67_ptr, #(-12*16 + 2*16)] + ldr q_root1_tw, [r67_ptr, #(-12*16 + 3*16)] + ldr q_root2, [r67_ptr, #(-12*16 + 4*16)] + ldr q_root2_tw, [r67_ptr, #(-12*16 + 5*16)] +.endm + +.macro load_roots_78_part2 + ldr q_root0, [r67_ptr, #(-12*16 + 6*16)] + ldr q_root0_tw, [r67_ptr, #(-12*16 + 7*16)] + ldr q_root1, [r67_ptr, #(-12*16 + 8*16)] + ldr q_root1_tw, [r67_ptr, #(-12*16 + 9*16)] + ldr q_root2, [r67_ptr, #(-12*16 + 10*16)] + ldr q_root2_tw, [r67_ptr, #(-12*16 + 11*16)] +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + // Inputs + in .req x0 // Input/output buffer + r012345_ptr .req x1 // twiddles for layer 0,1,2,3,4,5 + r67_ptr .req x2 // twiddles for layer 6,7 + + count .req x3 + inp .req x4 + inpp .req x5 + xtmp .req x6 + wtmp .req w6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + q_data0 .req q9 + q_data1 .req q10 + q_data2 .req q11 + q_data3 .req q12 + q_data4 .req q13 + q_data5 .req q14 + q_data6 .req q15 + q_data7 .req q16 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + q_root0 .req q0 + q_root1 .req q1 + q_root2 .req q2 + q_root3 .req q3 + + root0_tw 
.req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + q_root0_tw .req q4 + q_root1_tw .req q5 + q_root2_tw .req q6 + q_root3_tw .req q7 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + consts .req v8 + q_consts .req q8 + +.text +.global MLD_ASM_NAMESPACE(ntt_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(ntt_asm) + push_stack + + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup consts.4s, wtmp + + mov inp, in + mov count, #8 + + load_roots_123 + + .p2align 2 +layer123_start: + ldr q_data0, [in, #(0*(1024/8))] + ldr q_data1, [in, #(1*(1024/8))] + ldr q_data2, [in, #(2*(1024/8))] + ldr q_data3, [in, #(3*(1024/8))] + ldr q_data4, [in, #(4*(1024/8))] + ldr q_data5, [in, #(5*(1024/8))] + ldr q_data6, [in, #(6*(1024/8))] + ldr q_data7, [in, #(7*(1024/8))] + + ct_butterfly data0, data4, root0, 0, 1 + ct_butterfly data1, data5, root0, 0, 1 + ct_butterfly data2, data6, root0, 0, 1 + ct_butterfly data3, data7, root0, 0, 1 + + ct_butterfly data0, data2, root0, 2, 3 + ct_butterfly data1, data3, root0, 2, 3 + ct_butterfly data4, data6, root1, 0, 1 + ct_butterfly data5, data7, root1, 0, 1 + + ct_butterfly data0, data1, root1, 2, 3 + ct_butterfly data2, data3, root2, 0, 1 + ct_butterfly data4, data5, root2, 2, 3 + ct_butterfly data6, data7, root3, 0, 1 + + str q_data0, [in], #16 + str q_data1, [in, #(-16 + 1*(1024/8))] + str q_data2, [in, #(-16 + 2*(1024/8))] + str q_data3, [in, #(-16 + 3*(1024/8))] + str q_data4, [in, #(-16 + 4*(1024/8))] + str q_data5, [in, #(-16 + 5*(1024/8))] + str q_data6, [in, #(-16 + 6*(1024/8))] + str q_data7, [in, #(-16 + 7*(1024/8))] + + subs count, count, #1 + cbnz count, layer123_start + + mov in, inp + add inpp, in, #64 + mov count, #8 + + // Use two data pointers and carefully arrange + // increments to facilitate reordering of loads + // and stores by SLOTHY. 
+ // + // TODO: Think of alternatives here -- we start with `in` + // pointing 64 bytes below the actual data, which in theory + // could underflow. It's unclear how the CPU would behave in this case. + sub in, in, #64 + sub inpp, inpp, #64 + + .p2align 2 +layer45678_start: + ldr q_data0, [in, #(64 + 16*0)] + ldr q_data1, [in, #(64 + 16*1)] + ldr q_data2, [in, #(64 + 16*2)] + ldr q_data3, [in, #(64 + 16*3)] + ldr q_data4, [inpp, #(64 + 16*0)] + ldr q_data5, [inpp, #(64 + 16*1)] + ldr q_data6, [inpp, #(64 + 16*2)] + ldr q_data7, [inpp, #(64 + 16*3)] + + add in, in, #64 + add inpp, inpp, #64 + + load_roots_456 + + ct_butterfly data0, data4, root0, 0, 1 + ct_butterfly data1, data5, root0, 0, 1 + ct_butterfly data2, data6, root0, 0, 1 + ct_butterfly data3, data7, root0, 0, 1 + + ct_butterfly data0, data2, root0, 2, 3 + ct_butterfly data1, data3, root0, 2, 3 + ct_butterfly data4, data6, root1, 0, 1 + ct_butterfly data5, data7, root1, 0, 1 + + ct_butterfly data0, data1, root1, 2, 3 + ct_butterfly data2, data3, root2, 0, 1 + ct_butterfly data4, data5, root2, 2, 3 + ct_butterfly data6, data7, root3, 0, 1 + + // Transpose using trn + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 + + load_roots_78_part1 + + ct_butterfly_v data0, data2, root0, root0_tw + ct_butterfly_v data1, data3, root0, root0_tw + ct_butterfly_v data0, data1, root1, root1_tw + ct_butterfly_v data2, data3, root2, root2_tw + + load_roots_78_part2 + + ct_butterfly_v data4, data6, root0, root0_tw + ct_butterfly_v data5, data7, root0, root0_tw + ct_butterfly_v data4, data5, root1, root1_tw + ct_butterfly_v data6, data7, root2, root2_tw + + // Transpose as part of st4 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 + st4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp], #64 + + subs count, count, #1 + cbnz count, layer45678_start + + pop_stack + ret + +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/Makefile 
b/mldsa/native/aarch64/src/Makefile new file mode 100644 index 00000000..8e39f142 --- /dev/null +++ b/mldsa/native/aarch64/src/Makefile @@ -0,0 +1,42 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +###### +# To run, see the README.md file +###### +.PHONY: all clean + +# ISA to optimize for +TARGET_ISA=Arm_AArch64 + +# MicroArch target to optimize for +TARGET_MICROARCH=Arm_Cortex_A55 + +SLOTHY_EXTRA_FLAGS ?= + +SLOTHY_FLAGS=-c sw_pipelining.enabled=true \ + -c inputs_are_outputs \ + -c sw_pipelining.minimize_overlapping=False \ + -c sw_pipelining.allow_post \ + -c variable_size \ + -c constraints.stalls_first_attempt=64 \ + -c timeout=120 \ + $(SLOTHY_EXTRA_FLAGS) + +# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30. +# Allow SLOTHY to use all V-registers, but only caller-saved GPRs. +RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]" + +# Used for kernels which don't stash callee-saved registers. +# Restrict SLOTHY to caller-saved registers. +RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]" + +all: ntt.S + +# These units explicitly save and restore registers v8-v15, so SLOTHY can freely use +# those registers. 
+ntt.S: ../../../../dev/aarch64_clean/src/ntt.S + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l layer123_start -l layer45678_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG) + +clean: + -$(RM) -rf *.S diff --git a/mldsa/native/aarch64/src/ntt.S b/mldsa/native/aarch64/src/ntt.S index c9f091fb..4c1f54d4 100644 --- a/mldsa/native/aarch64/src/ntt.S +++ b/mldsa/native/aarch64/src/ntt.S @@ -74,12 +74,12 @@ .endm .macro load_roots_78_part2 - ldr q_root0, [r67_ptr, (-12*16 + 6*16)] - ldr q_root0_tw, [r67_ptr, (-12*16 + 7*16)] - ldr q_root1, [r67_ptr, (-12*16 + 8*16)] - ldr q_root1_tw, [r67_ptr, (-12*16 + 9*16)] - ldr q_root2, [r67_ptr, (-12*16 + 10*16)] - ldr q_root2_tw, [r67_ptr, (-12*16 + 11*16)] + ldr q_root0, [r67_ptr, #(-12*16 + 6*16)] + ldr q_root0_tw, [r67_ptr, #(-12*16 + 7*16)] + ldr q_root1, [r67_ptr, #(-12*16 + 8*16)] + ldr q_root1_tw, [r67_ptr, #(-12*16 + 9*16)] + ldr q_root2, [r67_ptr, #(-12*16 + 10*16)] + ldr q_root2_tw, [r67_ptr, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -192,42 +192,520 @@ MLD_ASM_FN_SYMBOL(ntt_asm) load_roots_123 .p2align 2 + // Instructions: 133 + // Expected cycles: 149 + // Expected IPC: 0.89 + // + // Cycle bound: 149.0 + // IPC bound: 0.89 + // + // Wall time: 1.34s + // User time: 1.34s + // + // ----------------------------------------------------------------- cycle (expected) -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + ldr q21, [x0, #896] // *.................................................................................................................................................... + ldr q9, [x0, #640] // ..*.................................................................................................................................................. 
+ mul v18.4S, v21.4S, v0.S[0] // ....*................................................................................................................................................ + sqrdmulh v24.4S, v21.4S, v0.S[1] // .....*............................................................................................................................................... + sqrdmulh v6.4S, v9.4S, v0.S[1] // ......*.............................................................................................................................................. + mul v20.4S, v9.4S, v0.S[0] // .......*............................................................................................................................................. + ldr q28, [x0, #768] // ........*............................................................................................................................................ + ldr q17, [x0, #384] // ..........*.......................................................................................................................................... + mls v20.4S, v6.4S, v8.S[0] // ............*........................................................................................................................................ + mls v18.4S, v24.4S, v8.S[0] // .............*....................................................................................................................................... + ldr q6, [x0, #912] // ..............*...................................................................................................................................... + sqrdmulh v25.4S, v28.4S, v0.S[1] // ................*.................................................................................................................................... + mul v26.4S, v28.4S, v0.S[0] // .................*................................................................................................................................... 
+ sub v5.4S, v17.4S, v18.4S // ..................*.................................................................................................................................. + add v12.4S, v17.4S, v18.4S // ...................*................................................................................................................................. + mul v13.4S, v6.4S, v0.S[0] // ....................*................................................................................................................................ + ldr q22, [x0, #128] // .....................*............................................................................................................................... + sqrdmulh v10.4S, v6.4S, v0.S[1] // .......................*............................................................................................................................. + mls v26.4S, v25.4S, v8.S[0] // ........................*............................................................................................................................ + add v6.4S, v22.4S, v20.4S // .........................*........................................................................................................................... + sub v15.4S, v22.4S, v20.4S // ..........................*.......................................................................................................................... + mls v13.4S, v10.4S, v8.S[0] // ...........................*......................................................................................................................... + ldr q28, [x0, #512] // ............................*........................................................................................................................ + ldr q25, [x0, #256] // ..............................*...................................................................................................................... 
+ ldr q4, [x0, #656] // ................................*.................................................................................................................... + ldr q27, [x0, #784] // ..................................*.................................................................................................................. + mul v31.4S, v4.4S, v0.S[0] // ....................................*................................................................................................................ + sqrdmulh v11.4S, v5.4S, v1.S[1] // .....................................*............................................................................................................... + mul v23.4S, v5.4S, v1.S[0] // ......................................*.............................................................................................................. + sqrdmulh v20.4S, v12.4S, v0.S[3] // .......................................*............................................................................................................. + mul v29.4S, v12.4S, v0.S[2] // ........................................*............................................................................................................ + ldr q14, [x0, #400] // .........................................*........................................................................................................... + mul v16.4S, v28.4S, v0.S[0] // ...........................................*......................................................................................................... + sqrdmulh v10.4S, v28.4S, v0.S[1] // ............................................*........................................................................................................ 
+ sqrdmulh v28.4S, v4.4S, v0.S[1] // .............................................*....................................................................................................... + add v18.4S, v14.4S, v13.4S // ..............................................*...................................................................................................... + sqrdmulh v9.4S, v27.4S, v0.S[1] // ...............................................*..................................................................................................... + mul v5.4S, v27.4S, v0.S[0] // ................................................*.................................................................................................... + mls v31.4S, v28.4S, v8.S[0] // .................................................*................................................................................................... + mls v29.4S, v20.4S, v8.S[0] // ..................................................*.................................................................................................. + sub v20.4S, v14.4S, v13.4S // ...................................................*................................................................................................. + mls v23.4S, v11.4S, v8.S[0] // ....................................................*................................................................................................ + mls v16.4S, v10.4S, v8.S[0] // .....................................................*............................................................................................... + sub v19.4S, v6.4S, v29.4S // ......................................................*.............................................................................................. 
+ add v14.4S, v6.4S, v29.4S // .......................................................*............................................................................................. + sub v6.4S, v15.4S, v23.4S // ........................................................*............................................................................................ + add v10.4S, v15.4S, v23.4S // .........................................................*........................................................................................... + sub v4.4S, v25.4S, v26.4S // ..........................................................*.......................................................................................... + add v11.4S, v25.4S, v26.4S // ...........................................................*......................................................................................... + ldr q30, [x0, #0] // ............................................................*........................................................................................ + sqrdmulh v24.4S, v10.4S, v2.S[3] // ..............................................................*...................................................................................... + mul v15.4S, v18.4S, v0.S[2] // ...............................................................*..................................................................................... + sqrdmulh v23.4S, v18.4S, v0.S[3] // ................................................................*.................................................................................... + ldr q13, [x0, #144] // .................................................................*................................................................................... + ldr q27, [x0, #528] // ...................................................................*................................................................................. 
+ mls v5.4S, v9.4S, v8.S[0] // .....................................................................*............................................................................... + ldr q12, [x0, #272] // ......................................................................*.............................................................................. + sqrdmulh v18.4S, v20.4S, v1.S[1] // ........................................................................*............................................................................ + mul v7.4S, v20.4S, v1.S[0] // .........................................................................*........................................................................... + mls v15.4S, v23.4S, v8.S[0] // ..........................................................................*.......................................................................... + mul v20.4S, v27.4S, v0.S[0] // ...........................................................................*......................................................................... + sqrdmulh v29.4S, v27.4S, v0.S[1] // ............................................................................*........................................................................ + mul v22.4S, v11.4S, v0.S[2] // .............................................................................*....................................................................... + add v26.4S, v12.4S, v5.4S // ..............................................................................*...................................................................... + sqrdmulh v11.4S, v11.4S, v0.S[3] // ...............................................................................*..................................................................... 
+ add v28.4S, v13.4S, v31.4S // ................................................................................*.................................................................... + mls v7.4S, v18.4S, v8.S[0] // .................................................................................*................................................................... + mul v18.4S, v4.4S, v1.S[0] // ..................................................................................*.................................................................. + sub v12.4S, v12.4S, v5.4S // ...................................................................................*................................................................. + sub v9.4S, v13.4S, v31.4S // ....................................................................................*................................................................ + sqrdmulh v5.4S, v4.4S, v1.S[1] // .....................................................................................*............................................................... + add v4.4S, v28.4S, v15.4S // ......................................................................................*.............................................................. + mls v20.4S, v29.4S, v8.S[0] // .......................................................................................*............................................................. + sqrdmulh v17.4S, v26.4S, v0.S[3] // ........................................................................................*............................................................ + sqrdmulh v21.4S, v14.4S, v1.S[3] // .........................................................................................*........................................................... 
+ mul v31.4S, v14.4S, v1.S[2] // ..........................................................................................*.......................................................... + mls v22.4S, v11.4S, v8.S[0] // ...........................................................................................*......................................................... + mul v13.4S, v26.4S, v0.S[2] // ............................................................................................*........................................................ + ldr q27, [x0, #16] // .............................................................................................*....................................................... + add v11.4S, v9.4S, v7.4S // ...............................................................................................*..................................................... + add v14.4S, v30.4S, v16.4S // ................................................................................................*.................................................... + mls v18.4S, v5.4S, v8.S[0] // .................................................................................................*................................................... + mul v5.4S, v19.4S, v2.S[0] // ..................................................................................................*.................................................. + sqrdmulh v26.4S, v12.4S, v1.S[1] // ...................................................................................................*................................................. + mul v10.4S, v10.4S, v2.S[2] // ....................................................................................................*................................................ 
+ mul v23.4S, v12.4S, v1.S[0] // .....................................................................................................*............................................... + sub v12.4S, v30.4S, v16.4S // ......................................................................................................*.............................................. + sqrdmulh v30.4S, v4.4S, v1.S[3] // .......................................................................................................*............................................. + sqrdmulh v19.4S, v19.4S, v2.S[1] // ........................................................................................................*............................................ + sqrdmulh v16.4S, v6.4S, v3.S[1] // .........................................................................................................*........................................... + mul v29.4S, v6.4S, v3.S[0] // ..........................................................................................................*.......................................... + sqrdmulh v25.4S, v11.4S, v2.S[3] // ...........................................................................................................*......................................... + mul v4.4S, v4.4S, v1.S[2] // ............................................................................................................*........................................ + mls v31.4S, v21.4S, v8.S[0] // .............................................................................................................*....................................... + mls v13.4S, v17.4S, v8.S[0] // ..............................................................................................................*...................................... 
+ mls v29.4S, v16.4S, v8.S[0] // ...............................................................................................................*..................................... + add v6.4S, v27.4S, v20.4S // ................................................................................................................*.................................... + mul v11.4S, v11.4S, v2.S[2] // .................................................................................................................*................................... + add v17.4S, v14.4S, v22.4S // ..................................................................................................................*.................................. + mls v10.4S, v24.4S, v8.S[0] // ...................................................................................................................*................................. + mls v23.4S, v26.4S, v8.S[0] // ....................................................................................................................*................................ + add v26.4S, v12.4S, v18.4S // .....................................................................................................................*............................... + sub v24.4S, v27.4S, v20.4S // ......................................................................................................................*.............................. + mls v5.4S, v19.4S, v8.S[0] // .......................................................................................................................*............................. + mls v4.4S, v30.4S, v8.S[0] // ........................................................................................................................*............................ 
+ add v21.4S, v26.4S, v10.4S // .........................................................................................................................*........................... + sub v16.4S, v12.4S, v18.4S // ..........................................................................................................................*.......................... + sub v27.4S, v14.4S, v22.4S // ...........................................................................................................................*......................... + add v30.4S, v6.4S, v13.4S // ............................................................................................................................*........................ + mls v11.4S, v25.4S, v8.S[0] // .............................................................................................................................*....................... + add v20.4S, v17.4S, v31.4S // ..............................................................................................................................*...................... + add v25.4S, v24.4S, v23.4S // ...............................................................................................................................*..................... + add v19.4S, v30.4S, v4.4S // ................................................................................................................................*.................... + sub v18.4S, v26.4S, v10.4S // .................................................................................................................................*................... + sub v14.4S, v28.4S, v15.4S // ..................................................................................................................................*.................. + sub v22.4S, v9.4S, v7.4S // ...................................................................................................................................*................. 
+ str q20, [x0], #16 // ....................................................................................................................................*................ + sub v9.4S, v16.4S, v29.4S // .....................................................................................................................................*............... + sub v20.4S, v6.4S, v13.4S // ......................................................................................................................................*.............. + sub v26.4S, v17.4S, v31.4S // .......................................................................................................................................*............. + sqrdmulh v6.4S, v14.4S, v2.S[1] // ........................................................................................................................................*............ + sub v13.4S, v25.4S, v11.4S // .........................................................................................................................................*........... + str q21, [x0, #496] // ..........................................................................................................................................*.......... + sqrdmulh v10.4S, v22.4S, v3.S[1] // ...........................................................................................................................................*......... + str q18, [x0, #624] // ............................................................................................................................................*........ + sub v7.4S, v27.4S, v5.4S // .............................................................................................................................................*....... + str q9, [x0, #880] // ..............................................................................................................................................*...... 
+ add v12.4S, v27.4S, v5.4S // ...............................................................................................................................................*..... + str q26, [x0, #112] // ................................................................................................................................................*.... + add v5.4S, v25.4S, v11.4S // .................................................................................................................................................*... + str q12, [x0, #240] // ..................................................................................................................................................*.. + sub v15.4S, v30.4S, v4.4S // ...................................................................................................................................................*. + str q19, [x0], #16 // ....................................................................................................................................................* + + // ----------------------------------------------------------------- cycle (expected) -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // ldr q11, [x0, #256] // ..............................*...................................................................................................................... + // ldr q24, [x0, #768] // ........*............................................................................................................................................ + // ldr q7, [x0, #512] // ............................*........................................................................................................................ 
+ // sqrdmulh v26.4S, v24.4S, v0.S[1] // ................*.................................................................................................................................... + // mul v23.4S, v24.4S, v0.S[0] // .................*................................................................................................................................... + // mul v18.4S, v7.4S, v0.S[0] // ...........................................*......................................................................................................... + // sqrdmulh v19.4S, v7.4S, v0.S[1] // ............................................*........................................................................................................ + // mls v23.4S, v26.4S, v8.S[0] // ........................*............................................................................................................................ + // ldr q7, [x0, #896] // *.................................................................................................................................................... + // mls v18.4S, v19.4S, v8.S[0] // .....................................................*............................................................................................... + // add v28.4S, v11.4S, v23.4S // ...........................................................*......................................................................................... + // sub v12.4S, v11.4S, v23.4S // ..........................................................*.......................................................................................... + // sqrdmulh v30.4S, v7.4S, v0.S[1] // .....*............................................................................................................................................... 
+ // mul v26.4S, v7.4S, v0.S[0] // ....*................................................................................................................................................ + // ldr q17, [x0, #640] // ..*.................................................................................................................................................. + // ldr q31, [x0, #384] // ..........*.......................................................................................................................................... + // mls v26.4S, v30.4S, v8.S[0] // .............*....................................................................................................................................... + // mul v11.4S, v17.4S, v0.S[0] // .......*............................................................................................................................................. + // sqrdmulh v25.4S, v17.4S, v0.S[1] // ......*.............................................................................................................................................. + // ldr q19, [x0, #128] // .....................*............................................................................................................................... + // sub v27.4S, v31.4S, v26.4S // ..................*.................................................................................................................................. + // mls v11.4S, v25.4S, v8.S[0] // ............*........................................................................................................................................ + // ldr q22, [x0, #0] // ............................................................*........................................................................................ 
+ // sqrdmulh v10.4S, v28.4S, v0.S[3] // ...............................................................................*..................................................................... + // mul v17.4S, v28.4S, v0.S[2] // .............................................................................*....................................................................... + // add v9.4S, v22.4S, v18.4S // ................................................................................................*.................................................... + // sub v24.4S, v22.4S, v18.4S // ......................................................................................................*.............................................. + // sqrdmulh v4.4S, v12.4S, v1.S[1] // .....................................................................................*............................................................... + // mul v23.4S, v12.4S, v1.S[0] // ..................................................................................*.................................................................. + // sqrdmulh v7.4S, v27.4S, v1.S[1] // .....................................*............................................................................................................... + // add v22.4S, v31.4S, v26.4S // ...................*................................................................................................................................. + // mul v26.4S, v27.4S, v1.S[0] // ......................................*.............................................................................................................. + // sub v12.4S, v19.4S, v11.4S // ..........................*.......................................................................................................................... 
+ // mls v23.4S, v4.4S, v8.S[0] // .................................................................................................*................................................... + // sqrdmulh v14.4S, v22.4S, v0.S[3] // .......................................*............................................................................................................. + // mul v4.4S, v22.4S, v0.S[2] // ........................................*............................................................................................................ + // mls v26.4S, v7.4S, v8.S[0] // ....................................................*................................................................................................ + // add v27.4S, v19.4S, v11.4S // .........................*........................................................................................................................... + // mls v4.4S, v14.4S, v8.S[0] // ..................................................*.................................................................................................. + // mls v17.4S, v10.4S, v8.S[0] // ...........................................................................................*......................................................... + // sub v22.4S, v12.4S, v26.4S // ........................................................*............................................................................................ + // sub v14.4S, v27.4S, v4.4S // ......................................................*.............................................................................................. + // sqrdmulh v10.4S, v22.4S, v3.S[1] // .........................................................................................................*........................................... 
+ // sqrdmulh v6.4S, v14.4S, v2.S[1] // ........................................................................................................*............................................ + // add v18.4S, v24.4S, v23.4S // .....................................................................................................................*............................... + // add v28.4S, v12.4S, v26.4S // .........................................................*........................................................................................... + // sub v20.4S, v9.4S, v17.4S // ...........................................................................................................................*......................... + // add v25.4S, v27.4S, v4.4S // .......................................................*............................................................................................. + // add v15.4S, v9.4S, v17.4S // ..................................................................................................................*.................................. + // mul v4.4S, v28.4S, v2.S[2] // ....................................................................................................*................................................ + // mul v11.4S, v25.4S, v1.S[2] // ..........................................................................................*.......................................................... + // sqrdmulh v31.4S, v25.4S, v1.S[3] // .........................................................................................*........................................................... + // sqrdmulh v26.4S, v28.4S, v2.S[3] // ..............................................................*...................................................................................... 
+ // mls v11.4S, v31.4S, v8.S[0] // .............................................................................................................*....................................... + // mls v4.4S, v26.4S, v8.S[0] // ...................................................................................................................*................................. + // add v5.4S, v15.4S, v11.4S // ..............................................................................................................................*...................... + // sub v15.4S, v15.4S, v11.4S // .......................................................................................................................................*............. + // sub v13.4S, v18.4S, v4.4S // .................................................................................................................................*................... + // str q5, [x0], #16 // ....................................................................................................................................*................ + // add v5.4S, v18.4S, v4.4S // .........................................................................................................................*........................... + // sub v16.4S, v24.4S, v23.4S // ..........................................................................................................................*.......................... + // ldr q11, [x0, #256] // ......................................................................*.............................................................................. + // ldr q24, [x0, #768] // ..................................*.................................................................................................................. 
+ // ldr q7, [x0, #512] // ...................................................................*................................................................................. + // sqrdmulh v26.4S, v24.4S, v0.S[1] // ...............................................*..................................................................................................... + // mul v23.4S, v24.4S, v0.S[0] // ................................................*.................................................................................................... + // mul v18.4S, v7.4S, v0.S[0] // ...........................................................................*......................................................................... + // sqrdmulh v19.4S, v7.4S, v0.S[1] // ............................................................................*........................................................................ + // mul v29.4S, v22.4S, v3.S[0] // ..........................................................................................................*.......................................... + // mls v23.4S, v26.4S, v8.S[0] // .....................................................................*............................................................................... + // ldr q7, [x0, #896] // ..............*...................................................................................................................................... + // mls v18.4S, v19.4S, v8.S[0] // .......................................................................................*............................................................. + // add v28.4S, v11.4S, v23.4S // ..............................................................................*...................................................................... 
+ // sub v12.4S, v11.4S, v23.4S // ...................................................................................*................................................................. + // sqrdmulh v30.4S, v7.4S, v0.S[1] // .......................*............................................................................................................................. + // mul v26.4S, v7.4S, v0.S[0] // ....................*................................................................................................................................ + // ldr q17, [x0, #640] // ................................*.................................................................................................................... + // ldr q31, [x0, #384] // .........................................*........................................................................................................... + // mls v29.4S, v10.4S, v8.S[0] // ...............................................................................................................*..................................... + // mls v26.4S, v30.4S, v8.S[0] // ...........................*......................................................................................................................... + // mul v11.4S, v17.4S, v0.S[0] // ....................................*................................................................................................................ + // sqrdmulh v25.4S, v17.4S, v0.S[1] // .............................................*....................................................................................................... + // ldr q19, [x0, #128] // .................................................................*................................................................................... 
+ // sub v27.4S, v31.4S, v26.4S // ...................................................*................................................................................................. + // mls v11.4S, v25.4S, v8.S[0] // .................................................*................................................................................................... + // ldr q22, [x0, #0] // .............................................................................................*....................................................... + // sqrdmulh v10.4S, v28.4S, v0.S[3] // ........................................................................................*............................................................ + // mul v17.4S, v28.4S, v0.S[2] // ............................................................................................*........................................................ + // add v9.4S, v22.4S, v18.4S // ................................................................................................................*.................................... + // sub v24.4S, v22.4S, v18.4S // ......................................................................................................................*.............................. + // sqrdmulh v4.4S, v12.4S, v1.S[1] // ...................................................................................................*................................................. + // mul v23.4S, v12.4S, v1.S[0] // .....................................................................................................*............................................... + // sqrdmulh v7.4S, v27.4S, v1.S[1] // ........................................................................*............................................................................ 
+ // add v22.4S, v31.4S, v26.4S // ..............................................*...................................................................................................... + // mul v26.4S, v27.4S, v1.S[0] // .........................................................................*........................................................................... + // sub v12.4S, v19.4S, v11.4S // ....................................................................................*................................................................ + // mls v23.4S, v4.4S, v8.S[0] // ....................................................................................................................*................................ + // mul v18.4S, v14.4S, v2.S[0] // ..................................................................................................*.................................................. + // sqrdmulh v14.4S, v22.4S, v0.S[3] // ................................................................*.................................................................................... + // mul v4.4S, v22.4S, v0.S[2] // ...............................................................*..................................................................................... + // mls v26.4S, v7.4S, v8.S[0] // .................................................................................*................................................................... + // mls v18.4S, v6.4S, v8.S[0] // .......................................................................................................................*............................. + // add v27.4S, v19.4S, v11.4S // ................................................................................*.................................................................... 
+ // mls v4.4S, v14.4S, v8.S[0] // ..........................................................................*.......................................................................... + // mls v17.4S, v10.4S, v8.S[0] // ..............................................................................................................*...................................... + // sub v7.4S, v20.4S, v18.4S // .............................................................................................................................................*....... + // sub v22.4S, v12.4S, v26.4S // ...................................................................................................................................*................. + // sub v14.4S, v27.4S, v4.4S // ..................................................................................................................................*.................. + // sub v30.4S, v16.4S, v29.4S // .....................................................................................................................................*............... + // sqrdmulh v10.4S, v22.4S, v3.S[1] // ...........................................................................................................................................*......... + // add v19.4S, v20.4S, v18.4S // ...............................................................................................................................................*..... + // sqrdmulh v6.4S, v14.4S, v2.S[1] // ........................................................................................................................................*............ + // add v18.4S, v24.4S, v23.4S // ...............................................................................................................................*..................... 
+ // add v28.4S, v12.4S, v26.4S // ...............................................................................................*..................................................... + // sub v20.4S, v9.4S, v17.4S // ......................................................................................................................................*.............. + // str q15, [x0, #112] // ................................................................................................................................................*.... + // add v25.4S, v27.4S, v4.4S // ......................................................................................*.............................................................. + // add v15.4S, v9.4S, v17.4S // ............................................................................................................................*........................ + // mul v4.4S, v28.4S, v2.S[2] // .................................................................................................................*................................... + // mul v11.4S, v25.4S, v1.S[2] // ............................................................................................................*........................................ + // sqrdmulh v31.4S, v25.4S, v1.S[3] // .......................................................................................................*............................................. + // str q5, [x0, #496] // ..........................................................................................................................................*.......... + // sqrdmulh v26.4S, v28.4S, v2.S[3] // ...........................................................................................................*......................................... 
+ // str q19, [x0, #240] // ..................................................................................................................................................*.. + // mls v11.4S, v31.4S, v8.S[0] // ........................................................................................................................*............................ + // str q13, [x0, #624] // ............................................................................................................................................*........ + // mls v4.4S, v26.4S, v8.S[0] // .............................................................................................................................*....................... + // str q30, [x0, #880] // ..............................................................................................................................................*...... + // add v5.4S, v15.4S, v11.4S // ................................................................................................................................*.................... + // sub v15.4S, v15.4S, v11.4S // ...................................................................................................................................................*. + // sub v13.4S, v18.4S, v4.4S // .........................................................................................................................................*........... + // str q5, [x0], #16 // ....................................................................................................................................................* + // add v5.4S, v18.4S, v4.4S // .................................................................................................................................................*... 
+ + sub count, count, #2 layer123_start: - ldr q_data0, [in, #(0*(1024/8))] - ldr q_data1, [in, #(1*(1024/8))] - ldr q_data2, [in, #(2*(1024/8))] - ldr q_data3, [in, #(3*(1024/8))] - ldr q_data4, [in, #(4*(1024/8))] - ldr q_data5, [in, #(5*(1024/8))] - ldr q_data6, [in, #(6*(1024/8))] - ldr q_data7, [in, #(7*(1024/8))] - - ct_butterfly data0, data4, root0, 0, 1 - ct_butterfly data1, data5, root0, 0, 1 - ct_butterfly data2, data6, root0, 0, 1 - ct_butterfly data3, data7, root0, 0, 1 - - ct_butterfly data0, data2, root0, 2, 3 - ct_butterfly data1, data3, root0, 2, 3 - ct_butterfly data4, data6, root1, 0, 1 - ct_butterfly data5, data7, root1, 0, 1 - - ct_butterfly data0, data1, root1, 2, 3 - ct_butterfly data2, data3, root2, 0, 1 - ct_butterfly data4, data5, root2, 2, 3 - ct_butterfly data6, data7, root3, 0, 1 - - str q_data0, [in], #16 - str q_data1, [in, #(-16 + 1*(1024/8))] - str q_data2, [in, #(-16 + 2*(1024/8))] - str q_data3, [in, #(-16 + 3*(1024/8))] - str q_data4, [in, #(-16 + 4*(1024/8))] - str q_data5, [in, #(-16 + 5*(1024/8))] - str q_data6, [in, #(-16 + 6*(1024/8))] - str q_data7, [in, #(-16 + 7*(1024/8))] - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Cycle bound: 84.0 + // IPC bound: 0.90 + // + // Wall time: 3.54s + // User time: 3.54s + // + // -------------------------------- cycle (expected) ---------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------- + str q7, [x0, #352] // l................................................................................... + add v21.4S, v16.4S, v29.4S // .l.................................................................................. + sub v16.4S, v24.4S, v23.4S // ..*................................................................................. + ldr q11, [x0, #256] // ...e................................................................................ 
+ ldr q24, [x0, #768] // .....e.............................................................................. + ldr q7, [x0, #512] // .......e............................................................................ + sqrdmulh v26.4S, v24.4S, v0.S[1] // .........e.......................................................................... + mul v23.4S, v24.4S, v0.S[0] // ..........e......................................................................... + mul v18.4S, v7.4S, v0.S[0] // ...........e........................................................................ + sqrdmulh v19.4S, v7.4S, v0.S[1] // ............e....................................................................... + mul v29.4S, v22.4S, v3.S[0] // .............*...................................................................... + mls v23.4S, v26.4S, v8.S[0] // ..............e..................................................................... + ldr q7, [x0, #896] // ...............e.................................................................... + mls v18.4S, v19.4S, v8.S[0] // .................e.................................................................. + add v28.4S, v11.4S, v23.4S // ..................e................................................................. + sub v12.4S, v11.4S, v23.4S // ...................e................................................................ + sqrdmulh v30.4S, v7.4S, v0.S[1] // ....................e............................................................... + mul v26.4S, v7.4S, v0.S[0] // .....................e.............................................................. + ldr q17, [x0, #640] // ......................e............................................................. + ldr q31, [x0, #384] // ........................e........................................................... + mls v29.4S, v10.4S, v8.S[0] // ..........................*......................................................... 
+ mls v26.4S, v30.4S, v8.S[0] // ...........................e........................................................ + mul v11.4S, v17.4S, v0.S[0] // ............................e....................................................... + sqrdmulh v25.4S, v17.4S, v0.S[1] // .............................e...................................................... + ldr q19, [x0, #128] // ..............................e..................................................... + sub v27.4S, v31.4S, v26.4S // ................................e................................................... + mls v11.4S, v25.4S, v8.S[0] // .................................e.................................................. + ldr q22, [x0, #0] // ..................................e................................................. + sqrdmulh v10.4S, v28.4S, v0.S[3] // ....................................e............................................... + mul v17.4S, v28.4S, v0.S[2] // .....................................e.............................................. + add v9.4S, v22.4S, v18.4S // ......................................e............................................. + sub v24.4S, v22.4S, v18.4S // .......................................e............................................ + sqrdmulh v4.4S, v12.4S, v1.S[1] // ........................................e........................................... + mul v23.4S, v12.4S, v1.S[0] // .........................................e.......................................... + sqrdmulh v7.4S, v27.4S, v1.S[1] // ..........................................e......................................... + add v22.4S, v31.4S, v26.4S // ...........................................e........................................ + mul v26.4S, v27.4S, v1.S[0] // ............................................e....................................... + sub v12.4S, v19.4S, v11.4S // .............................................e...................................... 
+ mls v23.4S, v4.4S, v8.S[0] // ..............................................e..................................... + str q21, [x0, #736] // ...............................................l.................................... + mul v18.4S, v14.4S, v2.S[0] // ................................................*................................... + sqrdmulh v14.4S, v22.4S, v0.S[3] // .................................................e.................................. + mul v4.4S, v22.4S, v0.S[2] // ..................................................e................................. + mls v26.4S, v7.4S, v8.S[0] // ...................................................e................................ + mls v18.4S, v6.4S, v8.S[0] // ....................................................*............................... + add v27.4S, v19.4S, v11.4S // .....................................................e.............................. + mls v4.4S, v14.4S, v8.S[0] // ......................................................e............................. + mls v17.4S, v10.4S, v8.S[0] // .......................................................e............................ + sub v7.4S, v20.4S, v18.4S // ........................................................*........................... + sub v22.4S, v12.4S, v26.4S // .........................................................e.......................... + sub v14.4S, v27.4S, v4.4S // ..........................................................e......................... + sub v30.4S, v16.4S, v29.4S // ...........................................................*........................ + sqrdmulh v10.4S, v22.4S, v3.S[1] // ............................................................e....................... + add v19.4S, v20.4S, v18.4S // .............................................................*...................... + sqrdmulh v6.4S, v14.4S, v2.S[1] // ..............................................................e..................... 
+ add v18.4S, v24.4S, v23.4S // ...............................................................e.................... + add v28.4S, v12.4S, v26.4S // ................................................................e................... + sub v20.4S, v9.4S, v17.4S // .................................................................e.................. + str q15, [x0, #112] // ..................................................................*................. + add v25.4S, v27.4S, v4.4S // ...................................................................e................ + add v15.4S, v9.4S, v17.4S // ....................................................................e............... + mul v4.4S, v28.4S, v2.S[2] // .....................................................................e.............. + mul v11.4S, v25.4S, v1.S[2] // ......................................................................e............. + sqrdmulh v31.4S, v25.4S, v1.S[3] // .......................................................................e............ + str q5, [x0, #496] // ........................................................................*........... + sqrdmulh v26.4S, v28.4S, v2.S[3] // .........................................................................e.......... + str q19, [x0, #240] // ..........................................................................*......... + mls v11.4S, v31.4S, v8.S[0] // ...........................................................................e........ + str q13, [x0, #624] // ............................................................................*....... + mls v4.4S, v26.4S, v8.S[0] // .............................................................................e...... + str q30, [x0, #880] // ..............................................................................*..... + add v5.4S, v15.4S, v11.4S // ...............................................................................e.... 
+ sub v15.4S, v15.4S, v11.4S // ................................................................................e... + sub v13.4S, v18.4S, v4.4S // .................................................................................e.. + str q5, [x0], #16 // ..................................................................................e. + add v5.4S, v18.4S, v4.4S // ...................................................................................e + + // ------------------------------------------------------------------------------------------------- cycle (expected) -------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------ + // ldr q9, [x0, #(0*(1024/8))] // ...............................e.................................................'.................................~.................................................'.................................~............. + // ldr q10, [x0, #(1*(1024/8))] // ...........................e.....................................................'.............................~.....................................................'.............................~................. + // ldr q11, [x0, #(2*(1024/8))] // e................................................................................'..~................................................................................'..~............................................ + // ldr q12, [x0, #(3*(1024/8))] // .....................e...........................................................'.......................~...........................................................'.......................~....................... 
+ // ldr q13, [x0, #(4*(1024/8))] // ....e............................................................................'......~............................................................................'......~........................................ + // ldr q14, [x0, #(5*(1024/8))] // ...................e.............................................................'.....................~.............................................................'.....................~......................... + // ldr q15, [x0, #(6*(1024/8))] // ..e..............................................................................'....~..............................................................................'....~.......................................... + // ldr q16, [x0, #(7*(1024/8))] // ............e....................................................................'..............~....................................................................'..............~................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .........e.......................................................................'...........~.......................................................................'...........~................................... + // mul v24.4s, v13.4s, v0.s[0] // ........e........................................................................'..........~........................................................................'..........~.................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............e..................................................................'................~..................................................................'................~.............................. 
+ // sub v13.4s, v9.4s, v24.4s // ....................................e............................................'......................................~............................................'......................................~........ + // add v9.4s, v9.4s, v24.4s // ...................................e.............................................'.....................................~.............................................'.....................................~......... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..........................e......................................................'............................~......................................................'............................~.................. + // mul v24.4s, v14.4s, v0.s[0] // .........................e.......................................................'...........................~.......................................................'...........................~................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................e..................................................'................................~..................................................'................................~.............. + // sub v14.4s, v10.4s, v24.4s // ..........................................e......................................'............................................~......................................'............................................~.. + // add v10.4s, v10.4s, v24.4s // ..................................................e..............................'....................................................~..............................'............................................... 
+ // sqrdmulh v27.4s, v15.4s, v0.s[1] // ......e..........................................................................'........~..........................................................................'........~...................................... + // mul v24.4s, v15.4s, v0.s[0] // .......e.........................................................................'.........~.........................................................................'.........~..................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........e.....................................................................'.............~.....................................................................'.............~................................. + // sub v15.4s, v11.4s, v24.4s // ................e................................................................'..................~................................................................'..................~............................ + // add v11.4s, v11.4s, v24.4s // ...............e.................................................................'.................~.................................................................'.................~............................. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................e...............................................................'...................~...............................................................'...................~........................... + // mul v24.4s, v16.4s, v0.s[0] // ..................e..............................................................'....................~..............................................................'....................~.......................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ........................e........................................................'..........................~........................................................'..........................~.................... + // sub v16.4s, v12.4s, v24.4s // .............................e...................................................'...............................~...................................................'...............................~............... + // add v12.4s, v12.4s, v24.4s // ........................................e........................................'..........................................~........................................'..........................................~.... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .................................e...............................................'...................................~...............................................'...................................~........... + // mul v24.4s, v11.4s, v0.s[2] // ..................................e..............................................'....................................~..............................................'....................................~.......... + // mls v24.4s, v27.4s, v8.s[0] // ....................................................e............................'......................................................~............................'............................................... + // sub v11.4s, v9.4s, v24.4s // ..............................................................e..................'................................................................~..................'............................................... + // add v9.4s, v9.4s, v24.4s // .................................................................e...............'...................................................................~...............'............................................... 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // ..............................................e..................................'................................................~..................................'............................................... + // mul v24.4s, v12.4s, v0.s[2] // ...............................................e.................................'.................................................~.................................'............................................... + // mls v24.4s, v27.4s, v8.s[0] // ...................................................e.............................'.....................................................~.............................'............................................... + // sub v12.4s, v10.4s, v24.4s // .......................................................e.........................'.........................................................~.........................'............................................... + // add v10.4s, v10.4s, v24.4s // ................................................................e................'..................................................................~................'............................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .....................................e...........................................'.......................................~...........................................'.......................................~....... + // mul v24.4s, v15.4s, v1.s[0] // ......................................e..........................................'........................................~..........................................'........................................~...... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................................e.....................................'.............................................~.....................................'.............................................~. + // sub v15.4s, v13.4s, v24.4s // .................................................................................'.*.................................................................................'.~............................................. + // add v13.4s, v13.4s, v24.4s // ............................................................e....................'..............................................................~....................'............................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .......................................e.........................................'.........................................~.........................................'.........................................~..... + // mul v24.4s, v16.4s, v1.s[0] // .........................................e.......................................'...........................................~.......................................'...........................................~... + // mls v24.4s, v27.4s, v8.s[0] // ................................................e................................'..................................................~................................'............................................... + // sub v16.4s, v14.4s, v24.4s // ......................................................e..........................'........................................................~..........................'............................................... 
+ // add v14.4s, v14.4s, v24.4s // .............................................................e...................'...............................................................~...................'............................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ....................................................................e............'......................................................................~............'............................................... + // mul v24.4s, v10.4s, v1.s[2] // ...................................................................e.............'.....................................................................~.............'............................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................e........'..........................................................................~........'............................................... + // sub v10.4s, v9.4s, v24.4s // .............................................................................e...'...............................................................................~...'............................................... + // add v9.4s, v9.4s, v24.4s // ............................................................................e....'..............................................................................~....'............................................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ...........................................................e.....................'.............................................................~.....................'............................................... 
+ // mul v24.4s, v12.4s, v2.s[0] // .............................................~...................................'...............................................*...................................'............................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................................~...............................'...................................................*...............................'............................................... + // sub v12.4s, v11.4s, v24.4s // .....................................................~...........................'.......................................................*...........................'............................................... + // add v11.4s, v11.4s, v24.4s // ..........................................................~......................'............................................................*......................'............................................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ......................................................................e..........'........................................................................~..........'............................................... + // mul v24.4s, v14.4s, v2.s[2] // ..................................................................e..............'....................................................................~..............'............................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................e......'............................................................................~......'............................................... 
+ // sub v14.4s, v13.4s, v24.4s // ..............................................................................e..'................................................................................~..'............................................... + // add v13.4s, v13.4s, v24.4s // ................................................................................e'..................................................................................~'............................................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .........................................................e.......................'...........................................................~.......................'............................................... + // mul v24.4s, v16.4s, v3.s[0] // ..........~......................................................................'............*......................................................................'............~.................................. + // mls v24.4s, v27.4s, v8.s[0] // .......................~.........................................................'.........................*.........................................................'.........................~..................... + // sub v16.4s, v15.4s, v24.4s // ........................................................~........................'..........................................................*........................'............................................... + // add v15.4s, v15.4s, v24.4s // .................................................................................'~..................................................................................'l.............................................. + // str q9, [x0], #16 // ...............................................................................e.'.................................................................................~.'............................................... 
+ // str q10, [x0, #(-16 + 1*(1024/8))] // ...............................................................~.................'.................................................................*.................'............................................... + // str q11, [x0, #(-16 + 2*(1024/8))] // .......................................................................~.........'.........................................................................*.........'............................................... + // str q12, [x0, #(-16 + 3*(1024/8))] // .................................................................................~...................................................................................l............................................... + // str q13, [x0, #(-16 + 4*(1024/8))] // .....................................................................~...........'.......................................................................*...........'............................................... + // str q14, [x0, #(-16 + 5*(1024/8))] // .........................................................................~.......'...........................................................................*.......'............................................... + // str q15, [x0, #(-16 + 6*(1024/8))] // ............................................~....................................'..............................................~....................................'..............................................l + // str q16, [x0, #(-16 + 7*(1024/8))] // ...........................................................................~.....'.............................................................................*.....'............................................... 
+ + sub count, count, 1 cbnz count, layer123_start + // Instructions: 19 + // Expected cycles: 20 + // Expected IPC: 0.95 + // + // Cycle bound: 20.0 + // IPC bound: 0.95 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mul v26.4S, v14.4S, v2.S[0] // *............................. + str q13, [x0, #624] // .*............................ + mul v13.4S, v22.4S, v3.S[0] // ..*........................... + str q15, [x0, #112] // ...*.......................... + mls v26.4S, v6.4S, v8.S[0] // ....*......................... + str q7, [x0, #352] // .....*........................ + mls v13.4S, v10.4S, v8.S[0] // ......*....................... + sub v11.4S, v24.4S, v23.4S // .......*...................... + add v14.4S, v16.4S, v29.4S // ........*..................... + str q5, [x0, #496] // .........*.................... + add v19.4S, v11.4S, v13.4S // ..........*................... + str q14, [x0, #736] // ...........*.................. + sub v5.4S, v11.4S, v13.4S // ............*................. + str q19, [x0, #752] // .............*................ + sub v10.4S, v20.4S, v26.4S // ..............*............... + str q5, [x0, #880] // ...............*.............. + add v22.4S, v20.4S, v26.4S // ................*............. + str q10, [x0, #368] // .................*............ + str q22, [x0, #240] // ...................*.......... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // str q7, [x0, #352] // .....*......................... + // add v21.4S, v16.4S, v29.4S // ........*...................... + // sub v16.4S, v24.4S, v23.4S // .......*....................... + // mul v29.4S, v22.4S, v3.S[0] // ..*............................ + // mls v29.4S, v10.4S, v8.S[0] // ......*........................ + // str q21, [x0, #736] // ...........*................... + // mul v18.4S, v14.4S, v2.S[0] // *.............................. 
+ // mls v18.4S, v6.4S, v8.S[0] // ....*.......................... + // sub v7.4S, v20.4S, v18.4S // ..............*................ + // sub v30.4S, v16.4S, v29.4S // ............*.................. + // add v19.4S, v20.4S, v18.4S // ................*.............. + // str q15, [x0, #112] // ...*........................... + // str q5, [x0, #496] // .........*..................... + // str q19, [x0, #240] // ...................*........... + // str q13, [x0, #624] // .*............................. + // str q30, [x0, #880] // ...............*............... + // str q7, [x0, #368] // .................*............. + // add v21.4S, v16.4S, v29.4S // ..........*.................... + // str q21, [x0, #752] // .............*................. + mov in, inp add inpp, in, #64 @@ -236,7 +714,7 @@ layer123_start: // Use two data pointers and carefully arrange // increments to facilitate reordering of loads // and stores by SLOTHY. - // + // TODO: Think of alternatives here -- the start with `in` // pointing to 64 byte below the actual data, which in theory // could underflow. It's unclear how the CPU would behave in this case. @@ -244,60 +722,640 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 + // Instructions: 143 + // Expected cycles: 165 + // Expected IPC: 0.87 + // + // Cycle bound: 165.0 + // IPC bound: 0.87 + // + // Wall time: 2.05s + // User time: 2.05s + // + // ------------------------------------------------------------------------- cycle (expected) -------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + ldr q4, [x0, #112] // *.................................................................................................................................................................... 
+ ldr q23, [x0, #80] // ..*.................................................................................................................................................................. + ldr q20, [x0, #64] // ....*................................................................................................................................................................ + ldr q6, [x0, #96] // ......*.............................................................................................................................................................. + add x0, x0, #64 // ......*.............................................................................................................................................................. + ldr q3, [x5, #64] // ........*............................................................................................................................................................ + ldr q29, [x5, #96] // ..........*.......................................................................................................................................................... + ldr q10, [x5, #80] // ............*........................................................................................................................................................ + ldr q15, [x5, #112] // ..............*...................................................................................................................................................... + add x5, x5, #64 // ..............*...................................................................................................................................................... + ldr q5, [x1], #64 // ................*.................................................................................................................................................... 
+ ldr q18, [x1, #-48] // ..................*.................................................................................................................................................. + mul v19.4S, v15.4S, v5.S[0] // ....................*................................................................................................................................................ + sqrdmulh v12.4S, v15.4S, v5.S[1] // .....................*............................................................................................................................................... + sqrdmulh v26.4S, v10.4S, v5.S[1] // ......................*.............................................................................................................................................. + mul v0.4S, v10.4S, v5.S[0] // .......................*............................................................................................................................................. + mul v21.4S, v3.4S, v5.S[0] // ........................*............................................................................................................................................ + mls v19.4S, v12.4S, v8.S[0] // .........................*........................................................................................................................................... + sqrdmulh v3.4S, v3.4S, v5.S[1] // ..........................*.......................................................................................................................................... + mls v0.4S, v26.4S, v8.S[0] // ...........................*......................................................................................................................................... + mul v9.4S, v29.4S, v5.S[0] // ............................*........................................................................................................................................ 
+ add v11.4S, v4.4S, v19.4S // .............................*....................................................................................................................................... + sub v25.4S, v4.4S, v19.4S // ..............................*...................................................................................................................................... + add v13.4S, v23.4S, v0.4S // ...............................*..................................................................................................................................... + sub v17.4S, v23.4S, v0.4S // ................................*.................................................................................................................................... + mls v21.4S, v3.4S, v8.S[0] // .................................*................................................................................................................................... + sqrdmulh v27.4S, v29.4S, v5.S[1] // ..................................*.................................................................................................................................. + sqrdmulh v14.4S, v11.4S, v5.S[3] // ...................................*................................................................................................................................. + mul v0.4S, v11.4S, v5.S[2] // ....................................*................................................................................................................................ + sub v10.4S, v20.4S, v21.4S // .....................................*............................................................................................................................... + add v29.4S, v20.4S, v21.4S // ......................................*.............................................................................................................................. 
+ mls v9.4S, v27.4S, v8.S[0] // .......................................*............................................................................................................................. + mls v0.4S, v14.4S, v8.S[0] // ........................................*............................................................................................................................ + mul v28.4S, v25.4S, v18.S[0] // .........................................*........................................................................................................................... + sqrdmulh v7.4S, v25.4S, v18.S[1] // ..........................................*.......................................................................................................................... + add v1.4S, v6.4S, v9.4S // ...........................................*......................................................................................................................... + sub v21.4S, v6.4S, v9.4S // ............................................*........................................................................................................................ + add v20.4S, v13.4S, v0.4S // .............................................*....................................................................................................................... + mul v23.4S, v1.4S, v5.S[2] // ..............................................*...................................................................................................................... + sqrdmulh v24.4S, v1.4S, v5.S[3] // ...............................................*..................................................................................................................... + sqrdmulh v12.4S, v21.4S, v18.S[1] // ................................................*.................................................................................................................... 
+ mul v15.4S, v21.4S, v18.S[0] // .................................................*................................................................................................................... + sqrdmulh v26.4S, v20.4S, v18.S[3] // ..................................................*.................................................................................................................. + mul v2.4S, v20.4S, v18.S[2] // ...................................................*................................................................................................................. + ldr q20, [x1, #-32] // ....................................................*................................................................................................................ + mls v28.4S, v7.4S, v8.S[0] // ......................................................*.............................................................................................................. + sub v9.4S, v13.4S, v0.4S // .......................................................*............................................................................................................. + ldr q31, [x1, #-16] // ........................................................*............................................................................................................ + add v30.4S, v17.4S, v28.4S // ..........................................................*.......................................................................................................... + sub v28.4S, v17.4S, v28.4S // ...........................................................*......................................................................................................... + mls v23.4S, v24.4S, v8.S[0] // ............................................................*........................................................................................................ 
+ sqrdmulh v25.4S, v9.4S, v20.S[1] // .............................................................*....................................................................................................... + mul v1.4S, v9.4S, v20.S[0] // ..............................................................*...................................................................................................... + mls v15.4S, v12.4S, v8.S[0] // ...............................................................*..................................................................................................... + sqrdmulh v5.4S, v28.4S, v31.S[1] // ................................................................*.................................................................................................... + mul v3.4S, v30.4S, v20.S[2] // .................................................................*................................................................................................... + mul v14.4S, v28.4S, v31.S[0] // ..................................................................*.................................................................................................. + sqrdmulh v20.4S, v30.4S, v20.S[3] // ...................................................................*................................................................................................. + mls v2.4S, v26.4S, v8.S[0] // ....................................................................*................................................................................................ + mls v1.4S, v25.4S, v8.S[0] // .....................................................................*............................................................................................... 
+ sub v9.4S, v29.4S, v23.4S // ......................................................................*.............................................................................................. + add v12.4S, v29.4S, v23.4S // .......................................................................*............................................................................................. + mls v14.4S, v5.4S, v8.S[0] // ........................................................................*............................................................................................ + mls v3.4S, v20.4S, v8.S[0] // .........................................................................*........................................................................................... + add v17.4S, v10.4S, v15.4S // ..........................................................................*.......................................................................................... + sub v23.4S, v10.4S, v15.4S // ...........................................................................*......................................................................................... + add v31.4S, v12.4S, v2.4S // ............................................................................*........................................................................................ + sub v12.4S, v12.4S, v2.4S // .............................................................................*....................................................................................... + add v6.4S, v9.4S, v1.4S // ..............................................................................*...................................................................................... + sub v27.4S, v9.4S, v1.4S // ...............................................................................*..................................................................................... 
+ sub v2.4S, v17.4S, v3.4S // ................................................................................*.................................................................................... + sub v15.4S, v23.4S, v14.4S // .................................................................................*................................................................................... + add v7.4S, v17.4S, v3.4S // ..................................................................................*.................................................................................. + add v0.4S, v23.4S, v14.4S // ...................................................................................*................................................................................. + trn2 v23.4S, v6.4S, v27.4S // ....................................................................................*................................................................................ + trn2 v3.4S, v31.4S, v12.4S // .....................................................................................*............................................................................... + trn2 v30.4S, v7.4S, v2.4S // ......................................................................................*.............................................................................. + trn2 v11.4S, v0.4S, v15.4S // .......................................................................................*............................................................................. + ldr q13, [x2], #(12*16) // ........................................................................................*............................................................................ + ldr q19, [x2, #-176] // ..........................................................................................*.......................................................................... 
+ trn2 v22.2D, v3.2D, v23.2D // ............................................................................................*........................................................................ + ldr q18, [x2, #-80] // .............................................................................................*....................................................................... + ldr q25, [x2, #-96] // ...............................................................................................*..................................................................... + trn2 v1.2D, v30.2D, v11.2D // .................................................................................................*................................................................... + sqrdmulh v5.4S, v22.4S, v19.4S // ..................................................................................................*.................................................................. + mul v17.4S, v22.4S, v13.4S // ...................................................................................................*................................................................. + mul v10.4S, v1.4S, v25.4S // ....................................................................................................*................................................................ + sqrdmulh v16.4S, v1.4S, v18.4S // .....................................................................................................*............................................................... + trn1 v20.4S, v6.4S, v27.4S // ......................................................................................................*.............................................................. + trn1 v12.4S, v31.4S, v12.4S // .......................................................................................................*............................................................. 
+ mls v17.4S, v5.4S, v8.S[0] // ........................................................................................................*............................................................ + trn1 v31.4S, v7.4S, v2.4S // .........................................................................................................*........................................................... + trn1 v0.4S, v0.4S, v15.4S // ..........................................................................................................*.......................................................... + trn2 v29.2D, v12.2D, v20.2D // ...........................................................................................................*......................................................... + mls v10.4S, v16.4S, v8.S[0] // ............................................................................................................*........................................................ + trn1 v15.2D, v3.2D, v23.2D // .............................................................................................................*....................................................... + trn2 v22.2D, v31.2D, v0.2D // ..............................................................................................................*...................................................... + trn1 v4.2D, v30.2D, v11.2D // ...............................................................................................................*..................................................... + mul v16.4S, v29.4S, v13.4S // ................................................................................................................*.................................................... + sqrdmulh v9.4S, v29.4S, v19.4S // .................................................................................................................*................................................... 
+ add v2.4S, v15.4S, v17.4S // ..................................................................................................................*.................................................. + sub v21.4S, v15.4S, v17.4S // ...................................................................................................................*................................................. + ldr q5, [x2, #-160] // ....................................................................................................................*................................................ + ldr q13, [x2, #-112] // ......................................................................................................................*.............................................. + ldr q7, [x2, #-144] // ........................................................................................................................*............................................ + ldr q26, [x2, #-128] // ..........................................................................................................................*.......................................... + sqrdmulh v19.4S, v22.4S, v18.4S // ............................................................................................................................*........................................ + mul v18.4S, v22.4S, v25.4S // .............................................................................................................................*....................................... + add v3.4S, v4.4S, v10.4S // ..............................................................................................................................*...................................... + sub v11.4S, v4.4S, v10.4S // ...............................................................................................................................*..................................... 
+ ldr q30, [x2, #-32] // ................................................................................................................................*.................................... + ldr q28, [x2, #-64] // ..................................................................................................................................*.................................. + ldr q1, [x2, #-16] // ....................................................................................................................................*................................ + ldr q10, [x2, #-48] // ......................................................................................................................................*.............................. + mls v16.4S, v9.4S, v8.S[0] // ........................................................................................................................................*............................ + sqrdmulh v22.4S, v2.4S, v7.4S // .........................................................................................................................................*........................... + sqrdmulh v7.4S, v21.4S, v13.4S // ..........................................................................................................................................*.......................... + mul v24.4S, v21.4S, v26.4S // ...........................................................................................................................................*......................... + mul v5.4S, v2.4S, v5.4S // ............................................................................................................................................*........................ + trn1 v2.2D, v12.2D, v20.2D // .............................................................................................................................................*....................... 
+ mls v18.4S, v19.4S, v8.S[0] // ..............................................................................................................................................*...................... + mul v4.4S, v11.4S, v30.4S // ...............................................................................................................................................*..................... + sqrdmulh v15.4S, v11.4S, v1.4S // ................................................................................................................................................*.................... + mul v6.4S, v3.4S, v28.4S // .................................................................................................................................................*................... + sqrdmulh v13.4S, v3.4S, v10.4S // ..................................................................................................................................................*.................. + trn1 v0.2D, v31.2D, v0.2D // ...................................................................................................................................................*................. + mls v5.4S, v22.4S, v8.S[0] // ....................................................................................................................................................*................ + mls v24.4S, v7.4S, v8.S[0] // .....................................................................................................................................................*............... + sub v26.4S, v2.4S, v16.4S // ......................................................................................................................................................*.............. + add v2.4S, v2.4S, v16.4S // .......................................................................................................................................................*............. 
+ mls v4.4S, v15.4S, v8.S[0] // ........................................................................................................................................................*............ + mls v6.4S, v13.4S, v8.S[0] // .........................................................................................................................................................*........... + sub v20.4S, v0.4S, v18.4S // ..........................................................................................................................................................*.......... + add v30.4S, v0.4S, v18.4S // ...........................................................................................................................................................*......... + add v17.4S, v26.4S, v24.4S // ............................................................................................................................................................*........ + sub v18.4S, v26.4S, v24.4S // .............................................................................................................................................................*....... + add v15.4S, v2.4S, v5.4S // ..............................................................................................................................................................*...... + sub v16.4S, v2.4S, v5.4S // ...............................................................................................................................................................*..... + sub v26.4S, v20.4S, v4.4S // ................................................................................................................................................................*.... + add v23.4S, v30.4S, v6.4S // .................................................................................................................................................................*... 
+ add v25.4S, v20.4S, v4.4S // ..................................................................................................................................................................*.. + sub v24.4S, v30.4S, v6.4S // ...................................................................................................................................................................*. + st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x0], #64 // ....................................................................................................................................................................* + + // ------------------------------------------------------------------------- cycle (expected) -------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q22, [x0, #112] // *.................................................................................................................................................................... + // ldr q28, [x0, #64] // ....*................................................................................................................................................................ + // ldr q13, [x5, #64] // ........*............................................................................................................................................................ + // ldr q3, [x0, #80] // ..*.................................................................................................................................................................. + // ldr q14, [x0, #96] // ......*.............................................................................................................................................................. 
+ // add x0, x0, #64 // ......*.............................................................................................................................................................. + // ldr q17, [x1], #64 // ................*.................................................................................................................................................... + // ldr q26, [x5, #112] // ..............*...................................................................................................................................................... + // sqrdmulh v1.4S, v13.4S, v17.S[1] // ..........................*.......................................................................................................................................... + // mul v25.4S, v13.4S, v17.S[0] // ........................*............................................................................................................................................ + // sqrdmulh v19.4S, v26.4S, v17.S[1] // .....................*............................................................................................................................................... + // mul v10.4S, v26.4S, v17.S[0] // ....................*................................................................................................................................................ + // ldr q24, [x5, #80] // ............*........................................................................................................................................................ + // mls v25.4S, v1.4S, v8.S[0] // .................................*................................................................................................................................... + // mls v10.4S, v19.4S, v8.S[0] // .........................*........................................................................................................................................... 
+ // mul v9.4S, v24.4S, v17.S[0] // .......................*............................................................................................................................................. + // sqrdmulh v16.4S, v24.4S, v17.S[1] // ......................*.............................................................................................................................................. + // sub v21.4S, v28.4S, v25.4S // .....................................*............................................................................................................................... + // sub v0.4S, v22.4S, v10.4S // ..............................*...................................................................................................................................... + // add v13.4S, v22.4S, v10.4S // .............................*....................................................................................................................................... + // add v25.4S, v28.4S, v25.4S // ......................................*.............................................................................................................................. + // mls v9.4S, v16.4S, v8.S[0] // ...........................*......................................................................................................................................... + // ldr q29, [x5, #96] // ..........*.......................................................................................................................................................... + // add x5, x5, #64 // ..............*...................................................................................................................................................... + // mul v18.4S, v13.4S, v17.S[2] // ....................................*................................................................................................................................ 
+ // add v24.4S, v3.4S, v9.4S // ...............................*..................................................................................................................................... + // sub v28.4S, v3.4S, v9.4S // ................................*.................................................................................................................................... + // sqrdmulh v22.4S, v29.4S, v17.S[1] // ..................................*.................................................................................................................................. + // mul v29.4S, v29.4S, v17.S[0] // ............................*........................................................................................................................................ + // ldr q2, [x1, #-16] // ........................................................*............................................................................................................ + // sqrdmulh v23.4S, v13.4S, v17.S[3] // ...................................*................................................................................................................................. + // mls v29.4S, v22.4S, v8.S[0] // .......................................*............................................................................................................................. + // ldr q20, [x1, #-48] // ..................*.................................................................................................................................................. + // mls v18.4S, v23.4S, v8.S[0] // ........................................*............................................................................................................................ 
+ // add v5.4S, v14.4S, v29.4S // ...........................................*......................................................................................................................... + // sub v29.4S, v14.4S, v29.4S // ............................................*........................................................................................................................ + // mul v12.4S, v0.4S, v20.S[0] // .........................................*........................................................................................................................... + // mul v30.4S, v5.4S, v17.S[2] // ..............................................*...................................................................................................................... + // sqrdmulh v14.4S, v5.4S, v17.S[3] // ...............................................*..................................................................................................................... + // sqrdmulh v19.4S, v0.4S, v20.S[1] // ..........................................*.......................................................................................................................... + // add v31.4S, v24.4S, v18.4S // .............................................*....................................................................................................................... + // mul v7.4S, v29.4S, v20.S[0] // .................................................*................................................................................................................... + // mls v30.4S, v14.4S, v8.S[0] // ............................................................*........................................................................................................ 
+ // sub v15.4S, v24.4S, v18.4S // .......................................................*............................................................................................................. + // ldr q24, [x1, #-32] // ....................................................*................................................................................................................ + // sqrdmulh v5.4S, v29.4S, v20.S[1] // ................................................*.................................................................................................................... + // mul v1.4S, v31.4S, v20.S[2] // ...................................................*................................................................................................................. + // mls v12.4S, v19.4S, v8.S[0] // ......................................................*.............................................................................................................. + // sqrdmulh v18.4S, v15.4S, v24.S[1] // .............................................................*....................................................................................................... + // mls v7.4S, v5.4S, v8.S[0] // ...............................................................*..................................................................................................... + // sqrdmulh v0.4S, v31.4S, v20.S[3] // ..................................................*.................................................................................................................. + // sub v10.4S, v28.4S, v12.4S // ...........................................................*......................................................................................................... 
+ // sub v27.4S, v25.4S, v30.4S // ......................................................................*.............................................................................................. + // sub v31.4S, v21.4S, v7.4S // ...........................................................................*......................................................................................... + // add v23.4S, v21.4S, v7.4S // ..........................................................................*.......................................................................................... + // add v7.4S, v25.4S, v30.4S // .......................................................................*............................................................................................. + // mls v1.4S, v0.4S, v8.S[0] // ....................................................................*................................................................................................ + // mul v6.4S, v10.4S, v2.S[0] // ..................................................................*.................................................................................................. + // ldr q17, [x2, #16] // ..........................................................................................*.......................................................................... + // add v29.4S, v7.4S, v1.4S // ............................................................................*........................................................................................ + // add v21.4S, v28.4S, v12.4S // ..........................................................*.......................................................................................................... + // ldr q12, [x2, #112] // .............................................................................................*....................................................................... 
+ // mul v9.4S, v21.4S, v24.S[2] // .................................................................*................................................................................................... + // ldr q5, [x2], #(12*16) // ........................................................................................*............................................................................ + // sqrdmulh v20.4S, v10.4S, v2.S[1] // ................................................................*.................................................................................................... + // sub v13.4S, v7.4S, v1.4S // .............................................................................*....................................................................................... + // sqrdmulh v2.4S, v21.4S, v24.S[3] // ...................................................................*................................................................................................. + // ldr q14, [x2, #-96] // ...............................................................................................*..................................................................... + // mul v19.4S, v15.4S, v24.S[0] // ..............................................................*...................................................................................................... + // trn2 v30.4S, v29.4S, v13.4S // .....................................................................................*............................................................................... + // ldr q28, [x2, #-144] // ........................................................................................................................*............................................ 
+ // mls v9.4S, v2.4S, v8.S[0] // .........................................................................*........................................................................................... + // mls v6.4S, v20.4S, v8.S[0] // ........................................................................*............................................................................................ + // mls v19.4S, v18.4S, v8.S[0] // .....................................................................*............................................................................................... + // trn1 v22.4S, v29.4S, v13.4S // .......................................................................................................*............................................................. + // sub v1.4S, v23.4S, v9.4S // ................................................................................*.................................................................................... + // ldr q15, [x2, #-160] // ....................................................................................................................*................................................ + // sub v0.4S, v27.4S, v19.4S // ...............................................................................*..................................................................................... + // sub v3.4S, v31.4S, v6.4S // .................................................................................*................................................................................... + // add v29.4S, v31.4S, v6.4S // ...................................................................................*................................................................................. + // ldr q7, [x2, #-112] // ......................................................................................................................*.............................................. 
+ // trn1 v20.4S, v29.4S, v3.4S // ..........................................................................................................*.......................................................... + // add v26.4S, v27.4S, v19.4S // ..............................................................................*...................................................................................... + // add v21.4S, v23.4S, v9.4S // ..................................................................................*.................................................................................. + // ldr q31, [x2, #-32] // ................................................................................................................................*.................................... + // trn2 v10.4S, v26.4S, v0.4S // ....................................................................................*................................................................................ + // trn1 v24.4S, v26.4S, v0.4S // ......................................................................................................*.............................................................. + // trn1 v6.4S, v21.4S, v1.4S // .........................................................................................................*........................................................... + // ldr q4, [x2, #-128] // ..........................................................................................................................*.......................................... + // trn2 v25.2D, v22.2D, v24.2D // ...........................................................................................................*......................................................... + // trn1 v19.2D, v30.2D, v10.2D // .............................................................................................................*....................................................... 
+ // trn2 v13.2D, v30.2D, v10.2D // ............................................................................................*........................................................................ + // mul v9.4S, v25.4S, v5.4S // ................................................................................................................*.................................................... + // trn2 v10.4S, v21.4S, v1.4S // ......................................................................................*.............................................................................. + // trn2 v23.4S, v29.4S, v3.4S // .......................................................................................*............................................................................. + // mul v16.4S, v13.4S, v5.4S // ...................................................................................................*................................................................. + // trn2 v0.2D, v6.2D, v20.2D // ..............................................................................................................*...................................................... + // trn2 v1.2D, v10.2D, v23.2D // .................................................................................................*................................................................... + // sqrdmulh v29.4S, v13.4S, v17.4S // ..................................................................................................*.................................................................. + // trn1 v11.2D, v10.2D, v23.2D // ...............................................................................................................*..................................................... 
+ // mul v30.4S, v1.4S, v14.4S // ....................................................................................................*................................................................ + // sqrdmulh v21.4S, v1.4S, v12.4S // .....................................................................................................*............................................................... + // mul v1.4S, v0.4S, v14.4S // .............................................................................................................................*....................................... + // ldr q10, [x2, #-16] // ....................................................................................................................................*................................ + // mls v16.4S, v29.4S, v8.S[0] // ........................................................................................................*............................................................ + // ldr q3, [x2, #-48] // ......................................................................................................................................*.............................. + // sqrdmulh v29.4S, v0.4S, v12.4S // ............................................................................................................................*........................................ + // sub v12.4S, v19.4S, v16.4S // ...................................................................................................................*................................................. + // sqrdmulh v26.4S, v25.4S, v17.4S // .................................................................................................................*................................................... 
+ // trn1 v5.2D, v6.2D, v20.2D // ...................................................................................................................................................*................. + // mls v30.4S, v21.4S, v8.S[0] // ............................................................................................................*........................................................ + // sqrdmulh v0.4S, v12.4S, v7.4S // ..........................................................................................................................................*.......................... + // add v16.4S, v19.4S, v16.4S // ..................................................................................................................*.................................................. + // mul v17.4S, v12.4S, v4.4S // ...........................................................................................................................................*......................... + // trn1 v12.2D, v22.2D, v24.2D // .............................................................................................................................................*....................... + // mul v23.4S, v16.4S, v15.4S // ............................................................................................................................................*........................ + // sub v21.4S, v11.4S, v30.4S // ...............................................................................................................................*..................................... + // mls v9.4S, v26.4S, v8.S[0] // ........................................................................................................................................*............................ 
+ // add v18.4S, v11.4S, v30.4S // ..............................................................................................................................*...................................... + // mul v20.4S, v21.4S, v31.4S // ...............................................................................................................................................*..................... + // sqrdmulh v27.4S, v16.4S, v28.4S // .........................................................................................................................................*........................... + // mls v1.4S, v29.4S, v8.S[0] // ..............................................................................................................................................*...................... + // ldr q15, [x2, #-64] // ..................................................................................................................................*.................................. + // sqrdmulh v30.4S, v21.4S, v10.4S // ................................................................................................................................................*.................... + // mls v17.4S, v0.4S, v8.S[0] // .....................................................................................................................................................*............... + // add v22.4S, v12.4S, v9.4S // .......................................................................................................................................................*............. + // sub v12.4S, v12.4S, v9.4S // ......................................................................................................................................................*.............. 
+ // sub v9.4S, v5.4S, v1.4S // ..........................................................................................................................................................*.......... + // mls v20.4S, v30.4S, v8.S[0] // ........................................................................................................................................................*............ + // add v7.4S, v5.4S, v1.4S // ...........................................................................................................................................................*......... + // sqrdmulh v4.4S, v18.4S, v3.4S // ..................................................................................................................................................*.................. + // sub v6.4S, v12.4S, v17.4S // .............................................................................................................................................................*....... + // mul v14.4S, v18.4S, v15.4S // .................................................................................................................................................*................... + // sub v26.4S, v9.4S, v20.4S // ................................................................................................................................................................*.... + // mls v23.4S, v27.4S, v8.S[0] // ....................................................................................................................................................*................ + // add v5.4S, v12.4S, v17.4S // ............................................................................................................................................................*........ 
+ // mls v14.4S, v4.4S, v8.S[0] // .........................................................................................................................................................*........... + // add v25.4S, v9.4S, v20.4S // ..................................................................................................................................................................*.. + // add v3.4S, v22.4S, v23.4S // ..............................................................................................................................................................*...... + // sub v4.4S, v22.4S, v23.4S // ...............................................................................................................................................................*..... + // add v23.4S, v7.4S, v14.4S // .................................................................................................................................................................*... + // sub v24.4S, v7.4S, v14.4S // ...................................................................................................................................................................*. 
+ // st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x0], #64 // ....................................................................................................................................................................* + + sub count, count, #1 layer45678_start: - ldr q_data0, [in, #(64 + 16*0)] - ldr q_data1, [in, #(64 + 16*1)] - ldr q_data2, [in, #(64 + 16*2)] - ldr q_data3, [in, #(64 + 16*3)] - ldr q_data4, [inpp, #(64 + 16*0)] - ldr q_data5, [inpp, #(64 + 16*1)] - ldr q_data6, [inpp, #(64 + 16*2)] - ldr q_data7, [inpp, #(64 + 16*3)] - - add in, in, #64 - add inpp, inpp, #64 - - load_roots_456 - - ct_butterfly data0, data4, root0, 0, 1 - ct_butterfly data1, data5, root0, 0, 1 - ct_butterfly data2, data6, root0, 0, 1 - ct_butterfly data3, data7, root0, 0, 1 - - ct_butterfly data0, data2, root0, 2, 3 - ct_butterfly data1, data3, root0, 2, 3 - ct_butterfly data4, data6, root1, 0, 1 - ct_butterfly data5, data7, root1, 0, 1 - - ct_butterfly data0, data1, root1, 2, 3 - ct_butterfly data2, data3, root2, 0, 1 - ct_butterfly data4, data5, root2, 2, 3 - ct_butterfly data6, data7, root3, 0, 1 - - // Transpose using trn - transpose4 data0, data1, data2, data3 - transpose4 data4, data5, data6, data7 - - load_roots_78_part1 - - ct_butterfly_v data0, data2, root0, root0_tw - ct_butterfly_v data1, data3, root0, root0_tw - ct_butterfly_v data0, data1, root1, root1_tw - ct_butterfly_v data2, data3, root2, root2_tw - - load_roots_78_part2 - - ct_butterfly_v data4, data6, root0, root0_tw - ct_butterfly_v data5, data7, root0, root0_tw - ct_butterfly_v data4, data5, root1, root1_tw - ct_butterfly_v data6, data7, root2, root2_tw - - // Transpose as part of st4 - st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 - st4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp], #64 - - subs count, count, #1 + // Instructions: 144 + // Expected cycles: 174 + // Expected IPC: 0.83 + // + // Cycle bound: 172.0 + // IPC bound: 0.84 + // + // Wall time: 120.25s + // User time: 120.25s + // + // 
----------------------------------------------------------------------------- cycle (expected) ------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + st4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x5], #64 // *............................................................................................................................................................................. + ldr q22, [x0, #112] // .....e........................................................................................................................................................................ + ldr q28, [x0, #64] // .......e...................................................................................................................................................................... + ldr q13, [x5, #64] // .........e.................................................................................................................................................................... + ldr q3, [x0, #80] // ...........e.................................................................................................................................................................. + ldr q14, [x0, #96] // .............e................................................................................................................................................................ + add x0, x0, #64 // .............e................................................................................................................................................................ 
+ ldr q17, [x1], #64 // ...............e.............................................................................................................................................................. + ldr q26, [x5, #112] // .................e............................................................................................................................................................ + sqrdmulh v1.4S, v13.4S, v17.S[1] // ...................e.......................................................................................................................................................... + mul v25.4S, v13.4S, v17.S[0] // ....................e......................................................................................................................................................... + sqrdmulh v19.4S, v26.4S, v17.S[1] // .....................e........................................................................................................................................................ + mul v10.4S, v26.4S, v17.S[0] // ......................e....................................................................................................................................................... + ldr q24, [x5, #80] // .......................e...................................................................................................................................................... + mls v25.4S, v1.4S, v8.S[0] // .........................e.................................................................................................................................................... + mls v10.4S, v19.4S, v8.S[0] // ..........................e................................................................................................................................................... 
+ mul v9.4S, v24.4S, v17.S[0] // ...........................e.................................................................................................................................................. + sqrdmulh v16.4S, v24.4S, v17.S[1] // ............................e................................................................................................................................................. + sub v21.4S, v28.4S, v25.4S // .............................e................................................................................................................................................ + sub v0.4S, v22.4S, v10.4S // ..............................e............................................................................................................................................... + add v13.4S, v22.4S, v10.4S // ...............................e.............................................................................................................................................. + add v25.4S, v28.4S, v25.4S // ................................e............................................................................................................................................. + mls v9.4S, v16.4S, v8.S[0] // .................................e............................................................................................................................................ + ldr q29, [x5, #96] // ..................................e........................................................................................................................................... + add x5, x5, #64 // ..................................e........................................................................................................................................... 
+ mul v18.4S, v13.4S, v17.S[2] // ....................................e......................................................................................................................................... + add v24.4S, v3.4S, v9.4S // .....................................e........................................................................................................................................ + sub v28.4S, v3.4S, v9.4S // ......................................e....................................................................................................................................... + sqrdmulh v22.4S, v29.4S, v17.S[1] // .......................................e...................................................................................................................................... + mul v29.4S, v29.4S, v17.S[0] // ........................................e..................................................................................................................................... + ldr q2, [x1, #-16] // .........................................e.................................................................................................................................... + sqrdmulh v23.4S, v13.4S, v17.S[3] // ...........................................e.................................................................................................................................. + mls v29.4S, v22.4S, v8.S[0] // ............................................e................................................................................................................................. + ldr q20, [x1, #-48] // .............................................e................................................................................................................................ 
+ mls v18.4S, v23.4S, v8.S[0] // ...............................................e.............................................................................................................................. + add v5.4S, v14.4S, v29.4S // ................................................e............................................................................................................................. + sub v29.4S, v14.4S, v29.4S // .................................................e............................................................................................................................ + mul v12.4S, v0.4S, v20.S[0] // ..................................................e........................................................................................................................... + mul v30.4S, v5.4S, v17.S[2] // ...................................................e.......................................................................................................................... + sqrdmulh v14.4S, v5.4S, v17.S[3] // ....................................................e......................................................................................................................... + sqrdmulh v19.4S, v0.4S, v20.S[1] // .....................................................e........................................................................................................................ + add v31.4S, v24.4S, v18.4S // ......................................................e....................................................................................................................... + mul v7.4S, v29.4S, v20.S[0] // .......................................................e...................................................................................................................... 
+ mls v30.4S, v14.4S, v8.S[0] // ........................................................e..................................................................................................................... + sub v15.4S, v24.4S, v18.4S // .........................................................e.................................................................................................................... + ldr q24, [x1, #-32] // ..........................................................e................................................................................................................... + sqrdmulh v5.4S, v29.4S, v20.S[1] // ............................................................e................................................................................................................. + mul v1.4S, v31.4S, v20.S[2] // .............................................................e................................................................................................................ + mls v12.4S, v19.4S, v8.S[0] // ..............................................................e............................................................................................................... + sqrdmulh v18.4S, v15.4S, v24.S[1] // ...............................................................e.............................................................................................................. + mls v7.4S, v5.4S, v8.S[0] // ................................................................e............................................................................................................. + sqrdmulh v0.4S, v31.4S, v20.S[3] // .................................................................e............................................................................................................ 
+ sub v10.4S, v28.4S, v12.4S // ..................................................................e........................................................................................................... + sub v27.4S, v25.4S, v30.4S // ...................................................................e.......................................................................................................... + sub v31.4S, v21.4S, v7.4S // ....................................................................e......................................................................................................... + add v23.4S, v21.4S, v7.4S // .....................................................................e........................................................................................................ + add v7.4S, v25.4S, v30.4S // ......................................................................e....................................................................................................... + mls v1.4S, v0.4S, v8.S[0] // .......................................................................e...................................................................................................... + mul v6.4S, v10.4S, v2.S[0] // ........................................................................e..................................................................................................... + ldr q17, [x2, #16] // .........................................................................e.................................................................................................... + add v29.4S, v7.4S, v1.4S // ...........................................................................e.................................................................................................. 
+ add v21.4S, v28.4S, v12.4S // ............................................................................e................................................................................................. + ldr q12, [x2, #112] // .............................................................................e................................................................................................ + mul v9.4S, v21.4S, v24.S[2] // ...............................................................................e.............................................................................................. + ldr q5, [x2], #(12*16) // ................................................................................e............................................................................................. + sqrdmulh v20.4S, v10.4S, v2.S[1] // ..................................................................................e........................................................................................... + sub v13.4S, v7.4S, v1.4S // ...................................................................................e.......................................................................................... + sqrdmulh v2.4S, v21.4S, v24.S[3] // ....................................................................................e......................................................................................... + ldr q14, [x2, #-96] // .....................................................................................e........................................................................................ + mul v19.4S, v15.4S, v24.S[0] // .......................................................................................e...................................................................................... 
+ trn2 v30.4S, v29.4S, v13.4S // ........................................................................................e..................................................................................... + ldr q28, [x2, #-144] // .........................................................................................e.................................................................................... + mls v9.4S, v2.4S, v8.S[0] // ...........................................................................................e.................................................................................. + mls v6.4S, v20.4S, v8.S[0] // ............................................................................................e................................................................................. + mls v19.4S, v18.4S, v8.S[0] // .............................................................................................e................................................................................ + trn1 v22.4S, v29.4S, v13.4S // ..............................................................................................e............................................................................... + sub v1.4S, v23.4S, v9.4S // ...............................................................................................e.............................................................................. + ldr q15, [x2, #-160] // ................................................................................................e............................................................................. + sub v0.4S, v27.4S, v19.4S // ..................................................................................................e........................................................................... 
+ sub v3.4S, v31.4S, v6.4S // ...................................................................................................e.......................................................................... + add v29.4S, v31.4S, v6.4S // ....................................................................................................e......................................................................... + ldr q7, [x2, #-112] // .....................................................................................................e........................................................................ + trn1 v20.4S, v29.4S, v3.4S // .......................................................................................................e...................................................................... + add v26.4S, v27.4S, v19.4S // ........................................................................................................e..................................................................... + add v21.4S, v23.4S, v9.4S // .........................................................................................................e.................................................................... + ldr q31, [x2, #-32] // ..........................................................................................................e................................................................... + trn2 v10.4S, v26.4S, v0.4S // ............................................................................................................e................................................................. + trn1 v24.4S, v26.4S, v0.4S // .............................................................................................................e................................................................ 
+ trn1 v6.4S, v21.4S, v1.4S // ..............................................................................................................e............................................................... + ldr q4, [x2, #-128] // ...............................................................................................................e.............................................................. + trn2 v25.2D, v22.2D, v24.2D // .................................................................................................................e............................................................ + trn1 v19.2D, v30.2D, v10.2D // ..................................................................................................................e........................................................... + trn2 v13.2D, v30.2D, v10.2D // ...................................................................................................................e.......................................................... + mul v9.4S, v25.4S, v5.4S // ....................................................................................................................e......................................................... + trn2 v10.4S, v21.4S, v1.4S // .....................................................................................................................e........................................................ + trn2 v23.4S, v29.4S, v3.4S // ......................................................................................................................e....................................................... + mul v16.4S, v13.4S, v5.4S // .......................................................................................................................e...................................................... 
+ trn2 v0.2D, v6.2D, v20.2D // ........................................................................................................................e..................................................... + trn2 v1.2D, v10.2D, v23.2D // .........................................................................................................................e.................................................... + sqrdmulh v29.4S, v13.4S, v17.4S // ..........................................................................................................................e................................................... + trn1 v11.2D, v10.2D, v23.2D // ...........................................................................................................................e.................................................. + mul v30.4S, v1.4S, v14.4S // ............................................................................................................................e................................................. + sqrdmulh v21.4S, v1.4S, v12.4S // .............................................................................................................................e................................................ + mul v1.4S, v0.4S, v14.4S // ..............................................................................................................................e............................................... + ldr q10, [x2, #-16] // ...............................................................................................................................e.............................................. + mls v16.4S, v29.4S, v8.S[0] // .................................................................................................................................e............................................ 
+ ldr q3, [x2, #-48] // ..................................................................................................................................e........................................... + sqrdmulh v29.4S, v0.4S, v12.4S // ....................................................................................................................................e......................................... + sub v12.4S, v19.4S, v16.4S // .....................................................................................................................................e........................................ + sqrdmulh v26.4S, v25.4S, v17.4S // ......................................................................................................................................e....................................... + trn1 v5.2D, v6.2D, v20.2D // .......................................................................................................................................e...................................... + mls v30.4S, v21.4S, v8.S[0] // ........................................................................................................................................e..................................... + sqrdmulh v0.4S, v12.4S, v7.4S // .........................................................................................................................................e.................................... + add v16.4S, v19.4S, v16.4S // ..........................................................................................................................................e................................... + mul v17.4S, v12.4S, v4.4S // ...........................................................................................................................................e.................................. 
+ trn1 v12.2D, v22.2D, v24.2D // ............................................................................................................................................e................................. + mul v23.4S, v16.4S, v15.4S // .............................................................................................................................................e................................ + sub v21.4S, v11.4S, v30.4S // ..............................................................................................................................................e............................... + mls v9.4S, v26.4S, v8.S[0] // ...............................................................................................................................................e.............................. + add v18.4S, v11.4S, v30.4S // ................................................................................................................................................e............................. + mul v20.4S, v21.4S, v31.4S // .................................................................................................................................................e............................ + sqrdmulh v27.4S, v16.4S, v28.4S // ..................................................................................................................................................e........................... + mls v1.4S, v29.4S, v8.S[0] // ...................................................................................................................................................e.......................... + ldr q15, [x2, #-64] // ....................................................................................................................................................e......................... 
+ sqrdmulh v30.4S, v21.4S, v10.4S // ......................................................................................................................................................e....................... + mls v17.4S, v0.4S, v8.S[0] // .......................................................................................................................................................e...................... + add v22.4S, v12.4S, v9.4S // ........................................................................................................................................................e..................... + sub v12.4S, v12.4S, v9.4S // .........................................................................................................................................................e.................... + sub v9.4S, v5.4S, v1.4S // ..........................................................................................................................................................e................... + mls v20.4S, v30.4S, v8.S[0] // ...........................................................................................................................................................e.................. + add v7.4S, v5.4S, v1.4S // ............................................................................................................................................................e................. + sqrdmulh v4.4S, v18.4S, v3.4S // .............................................................................................................................................................e................ + sub v6.4S, v12.4S, v17.4S // ..............................................................................................................................................................e............... 
+ mul v14.4S, v18.4S, v15.4S // ...............................................................................................................................................................e.............. + sub v26.4S, v9.4S, v20.4S // ................................................................................................................................................................e............. + mls v23.4S, v27.4S, v8.S[0] // .................................................................................................................................................................e............ + add v5.4S, v12.4S, v17.4S // ..................................................................................................................................................................e........... + mls v14.4S, v4.4S, v8.S[0] // ...................................................................................................................................................................e.......... + add v25.4S, v9.4S, v20.4S // ....................................................................................................................................................................e......... + add v3.4S, v22.4S, v23.4S // .....................................................................................................................................................................e........ + sub v4.4S, v22.4S, v23.4S // ......................................................................................................................................................................e....... + add v23.4S, v7.4S, v14.4S // .......................................................................................................................................................................e...... 
+ sub v24.4S, v7.4S, v14.4S // ........................................................................................................................................................................e..... + st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x0], #64 // .........................................................................................................................................................................e.... + + // --------------------------------------------------------------------------- cycle (expected) ----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q9, [x0, #(64 + 16*0)] // ..e......................................................................................................................................................................' + // ldr q10, [x0, #(64 + 16*1)] // ......e..................................................................................................................................................................' + // ldr q11, [x0, #(64 + 16*2)] // ........e................................................................................................................................................................' + // ldr q12, [x0, #(64 + 16*3)] // e........................................................................................................................................................................' + // ldr q13, [x5, #(64 + 16*0)] // ....e....................................................................................................................................................................' 
+ // ldr q14, [x5, #(64 + 16*1)] // ..................e......................................................................................................................................................' + // ldr q15, [x5, #(64 + 16*2)] // .............................e...........................................................................................................................................' + // ldr q16, [x5, #(64 + 16*3)] // ............e............................................................................................................................................................' + // add x0, x0, #64 // ........e................................................................................................................................................................' + // add x5, x5, #64 // .............................e...........................................................................................................................................' + // ldr q0, [x1], #64 // ..........e..............................................................................................................................................................' + // ldr q1, [x1, #(-64 + 16)] // ........................................e................................................................................................................................' + // ldr q2, [x1, #(-64 + 32)] // .....................................................e...................................................................................................................' + // ldr q3, [x1, #(-64 + 48)] // ....................................e....................................................................................................................................' 
+ // sqrdmulh v27.4s, v13.4s, v0.s[1] // ..............e..........................................................................................................................................................' + // mul v24.4s, v13.4s, v0.s[0] // ...............e.........................................................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ....................e....................................................................................................................................................' + // sub v13.4s, v9.4s, v24.4s // ........................e................................................................................................................................................' + // add v9.4s, v9.4s, v24.4s // ...........................e.............................................................................................................................................' + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .......................e.................................................................................................................................................' + // mul v24.4s, v14.4s, v0.s[0] // ......................e..................................................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ............................e............................................................................................................................................' + // sub v14.4s, v10.4s, v24.4s // .................................e.......................................................................................................................................' 
+ // add v10.4s, v10.4s, v24.4s // ................................e........................................................................................................................................' + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ..................................e......................................................................................................................................' + // mul v24.4s, v15.4s, v0.s[0] // ...................................e.....................................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // .......................................e.................................................................................................................................' + // sub v15.4s, v11.4s, v24.4s // ............................................e............................................................................................................................' + // add v11.4s, v11.4s, v24.4s // ...........................................e.............................................................................................................................' + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ................e........................................................................................................................................................' + // mul v24.4s, v16.4s, v0.s[0] // .................e.......................................................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // .....................e...................................................................................................................................................' 
+ // sub v16.4s, v12.4s, v24.4s // .........................e...............................................................................................................................................' + // add v12.4s, v12.4s, v24.4s // ..........................e..............................................................................................................................................' + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ...............................................e.........................................................................................................................' + // mul v24.4s, v11.4s, v0.s[2] // ..............................................e..........................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ...................................................e.....................................................................................................................' + // sub v11.4s, v9.4s, v24.4s // ..............................................................e..........................................................................................................' + // add v9.4s, v9.4s, v24.4s // .................................................................e.......................................................................................................' + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ......................................e..................................................................................................................................' + // mul v24.4s, v12.4s, v0.s[2] // ...............................e.........................................................................................................................................' 
+ // mls v24.4s, v27.4s, v8.s[0] // ..........................................e..............................................................................................................................' + // sub v12.4s, v10.4s, v24.4s // ....................................................e....................................................................................................................' + // add v10.4s, v10.4s, v24.4s // .................................................e.......................................................................................................................' + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .......................................................e.................................................................................................................' + // mul v24.4s, v15.4s, v1.s[0] // ..................................................e......................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................e.............................................................................................................' + // sub v15.4s, v13.4s, v24.4s // ...............................................................e.........................................................................................................' + // add v13.4s, v13.4s, v24.4s // ................................................................e........................................................................................................' + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ................................................e........................................................................................................................' 
+ // mul v24.4s, v16.4s, v1.s[0] // .............................................e...........................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // .........................................................e...............................................................................................................' + // sub v16.4s, v14.4s, v24.4s // .............................................................e...........................................................................................................' + // add v14.4s, v14.4s, v24.4s // .......................................................................e.................................................................................................' + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ............................................................e............................................................................................................' + // mul v24.4s, v10.4s, v1.s[2] // ........................................................e................................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................e......................................................................................................' + // sub v10.4s, v9.4s, v24.4s // ..............................................................................e..........................................................................................' + // add v9.4s, v9.4s, v24.4s // ......................................................................e..................................................................................................' 
+ // sqrdmulh v27.4s, v12.4s, v2.s[1] // ..........................................................e..............................................................................................................' + // mul v24.4s, v12.4s, v2.s[0] // ..................................................................................e......................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................................e................................................................................' + // sub v12.4s, v11.4s, v24.4s // .............................................................................................e...........................................................................' + // add v11.4s, v11.4s, v24.4s // ...................................................................................................e.....................................................................' + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...............................................................................e.........................................................................................' + // mul v24.4s, v14.4s, v2.s[2] // ..........................................................................e..............................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................................e..................................................................................' + // sub v14.4s, v13.4s, v24.4s // ..........................................................................................e..............................................................................' 
+ // add v13.4s, v13.4s, v24.4s // ....................................................................................................e....................................................................' + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .............................................................................e...........................................................................................' + // mul v24.4s, v16.4s, v3.s[0] // ...................................................................e.....................................................................................................' + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................e.................................................................................' + // sub v16.4s, v15.4s, v24.4s // ..............................................................................................e..........................................................................' + // add v15.4s, v15.4s, v24.4s // ...............................................................................................e.........................................................................' + // trn1 v25.4s, v9.4s, v10.4s // .........................................................................................e...............................................................................' + // trn2 v26.4s, v9.4s, v10.4s // ...................................................................................e.....................................................................................' + // trn1 v27.4s, v11.4s, v12.4s // ........................................................................................................e................................................................' 
+ // trn2 v28.4s, v11.4s, v12.4s // .......................................................................................................e.................................................................' + // trn2 v11.2d, v25.2d, v27.2d // ............................................................................................................e............................................................' + // trn2 v12.2d, v26.2d, v28.2d // ..............................................................................................................e..........................................................' + // trn1 v9.2d, v25.2d, v27.2d // .......................................................................................................................................e.................................' + // trn1 v10.2d, v26.2d, v28.2d // .............................................................................................................e...........................................................' + // trn1 v25.4s, v13.4s, v14.4s // .........................................................................................................e...............................................................' + // trn2 v26.4s, v13.4s, v14.4s // ................................................................................................................e........................................................' + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................................e......................................................................' + // trn2 v28.4s, v15.4s, v16.4s // .................................................................................................................e.......................................................' 
+ // trn2 v15.2d, v25.2d, v27.2d // ...................................................................................................................e.....................................................' + // trn2 v16.2d, v26.2d, v28.2d // ....................................................................................................................e....................................................' + // trn1 v13.2d, v25.2d, v27.2d // ..................................................................................................................................e......................................' + // trn1 v14.2d, v26.2d, v28.2d // ......................................................................................................................e..................................................' + // ldr q0, [x2], #(12*16) // ...........................................................................e.............................................................................................' + // ldr q4, [x2, #(-12*16 + 1*16)] // ....................................................................e....................................................................................................' + // ldr q1, [x2, #(-12*16 + 2*16)] // ...........................................................................................e.............................................................................' + // ldr q5, [x2, #(-12*16 + 3*16)] // ....................................................................................e....................................................................................' + // ldr q2, [x2, #(-12*16 + 4*16)] // ..........................................................................................................e..............................................................' 
+ // ldr q6, [x2, #(-12*16 + 5*16)] // ................................................................................................e........................................................................' + // sqrdmulh v27.4s, v11.4s, v4.4s // .................................................................................................................................e.......................................' + // mul v24.4s, v11.4s, v0.4s // ...............................................................................................................e.........................................................' + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................................................................................e..............................' + // sub v11.4s, v9.4s, v24.4s // ....................................................................................................................................................e....................' + // add v9.4s, v9.4s, v24.4s // ...................................................................................................................................................e.....................' + // sqrdmulh v27.4s, v12.4s, v4.4s // .....................................................................................................................e...................................................' + // mul v24.4s, v12.4s, v0.4s // ..................................................................................................................e......................................................' + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................................................e............................................' 
+ // sub v12.4s, v10.4s, v24.4s // ................................................................................................................................e........................................' + // add v10.4s, v10.4s, v24.4s // .....................................................................................................................................e...................................' + // sqrdmulh v27.4s, v10.4s, v5.4s // .............................................................................................................................................e...........................' + // mul v24.4s, v10.4s, v1.4s // ........................................................................................................................................e................................' + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................................................................................e............' + // sub v10.4s, v9.4s, v24.4s // .................................................................................................................................................................e.......' + // add v9.4s, v9.4s, v24.4s // ................................................................................................................................................................e........' + // sqrdmulh v27.4s, v12.4s, v6.4s // ....................................................................................................................................e....................................' + // mul v24.4s, v12.4s, v2.4s // ......................................................................................................................................e..................................' 
+ // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................................................................................e......................' + // sub v12.4s, v11.4s, v24.4s // .........................................................................................................................................................e...............' + // add v11.4s, v11.4s, v24.4s // .............................................................................................................................................................e...........' + // ldr q0, [x2, #(-12*16 + 6*16)] // ................................................................................e........................................................................................' + // ldr q4, [x2, #(-12*16 + 7*16)] // ........................................................................e................................................................................................' + // ldr q1, [x2, #(-12*16 + 8*16)] // ...............................................................................................................................................e.........................' + // ldr q5, [x2, #(-12*16 + 9*16)] // .............................................................................................................................e...........................................' + // ldr q2, [x2, #(-12*16 + 10*16)] // .....................................................................................................e...................................................................' + // ldr q6, [x2, #(-12*16 + 11*16)] // ..........................................................................................................................e..............................................' 
+ // sqrdmulh v27.4s, v15.4s, v4.4s // ...............................................................................................................................e.........................................' + // mul v24.4s, v15.4s, v0.4s // .........................................................................................................................e...............................................' + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................................................e..........................' + // sub v15.4s, v13.4s, v24.4s // .....................................................................................................................................................e...................' + // add v13.4s, v13.4s, v24.4s // .......................................................................................................................................................e.................' + // sqrdmulh v27.4s, v16.4s, v4.4s // ........................................................................................................................e................................................' + // mul v24.4s, v16.4s, v0.4s // .......................................................................................................................e.................................................' + // mls v24.4s, v27.4s, v8.s[0] // ...................................................................................................................................e.....................................' + // sub v16.4s, v14.4s, v24.4s // .........................................................................................................................................e...............................' 
+ // add v14.4s, v14.4s, v24.4s // ...........................................................................................................................................e.............................' + // sqrdmulh v27.4s, v14.4s, v5.4s // ........................................................................................................................................................e................' + // mul v24.4s, v14.4s, v1.4s // ..........................................................................................................................................................e..............' + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................................................................e..........' + // sub v14.4s, v13.4s, v24.4s // ...................................................................................................................................................................e.....' + // add v13.4s, v13.4s, v24.4s // ..................................................................................................................................................................e......' + // sqrdmulh v27.4s, v16.4s, v6.4s // .................................................................................................................................................e.......................' + // mul v24.4s, v16.4s, v2.4s // ............................................................................................................................................e............................' + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................................................................................................e..................' 
+ // sub v16.4s, v15.4s, v24.4s // ...........................................................................................................................................................e.............' + // add v15.4s, v15.4s, v24.4s // ...............................................................................................................................................................e.........' + // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x0], #64 // ....................................................................................................................................................................e....' + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x5], #64 // .........................................................................................................................................................................* + + sub count, count, 1 cbnz count, layer45678_start + // Instructions: 1 + // Expected cycles: 1 + // Expected IPC: 1.00 + // + // Cycle bound: 1.0 + // IPC bound: 1.00 + // + // Wall time: 0.00s + // User time: 0.00s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + st4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x5], #64 // *............................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // st4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x5], #64 // *.............................. + pop_stack ret From 9ad80fbecd2bf795e5ef71e142b234f04476584f Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 15 May 2025 16:14:59 +0800 Subject: [PATCH 2/3] Switch AArch64 INTT to a 5+3 layer merge Signed-off-by: Matthias J. 
Kannwischer --- mldsa/native/aarch64/src/aarch64_zetas.c | 46 +-- mldsa/native/aarch64/src/intt.S | 388 ++++++++++------------- scripts/autogen | 22 +- 3 files changed, 208 insertions(+), 248 deletions(-) diff --git a/mldsa/native/aarch64/src/aarch64_zetas.c b/mldsa/native/aarch64/src/aarch64_zetas.c index 3822ef8a..d12bf51c 100644 --- a/mldsa/native/aarch64/src/aarch64_zetas.c +++ b/mldsa/native/aarch64/src/aarch64_zetas.c @@ -194,28 +194,30 @@ MLD_ALIGN const int32_t mld_aarch64_intt_zetas_layer78[] = { }; MLD_ALIGN const int32_t mld_aarch64_intt_zetas_layer123456[] = { - -2283733, -585207070, -1858416, -476219497, -3345963, -857403734, - -2815639, -721508096, -1853806, -475038184, -2917338, -747568486, - 3585098, 918682129, -3870317, -991769559, -556856, -142694469, - 642628, 164673562, -3192354, -818041395, 2897314, 742437332, - -1460718, -374309300, 3950053, 1012201926, 1716988, 439978542, - -2453983, -628833668, 1935799, 496048908, -3756790, -962678241, - -1714295, -439288460, 3574466, 915957677, 817536, 209493775, - 3227876, 827143915, -1759347, -450833045, -3415069, -875112161, - 1335936, 342333886, -2156050, -552488273, -3241972, -830756018, - -676590, -173376332, 4018989, 1029866791, -2071829, -530906624, - 434125, 111244624, 3506380, 898510625, -1095468, -280713909, - 3524442, 903139016, -928749, -237992130, -394148, -101000509, - 1674615, 429120452, -1159875, -297218217, -3704823, -949361686, - -2663378, -682491182, -2101410, -538486762, 3110818, 797147778, - 4063053, 1041158200, 3586446, 919027554, -2740543, -702264730, - 3370349, 863652652, -3182878, -815613168, -3602218, -923069133, - -294725, -75523344, -3761513, -963888510, -3765607, -964937599, - 3201430, 820367122, 3145678, 806080660, 2883726, 738955404, - 3201494, 820383522, 1221177, 312926867, -557458, -142848732, - 1005239, 257592709, -3764867, -964747974, -2129892, -545785280, - -2682288, -687336873, -3542485, -907762539, 601683, 154181397, - 0, 0, + 1221177, 312926867, -2283733, -585207070, 
-2815639, -721508096, + -1858416, -476219497, -3345963, -857403734, -1853806, -475038184, + -2917338, -747568486, 0, 0, -557458, -142848732, + 3585098, 918682129, 642628, 164673562, -3870317, -991769559, + -556856, -142694469, -3192354, -818041395, 2897314, 742437332, + 0, 0, 1005239, 257592709, -1460718, -374309300, + -2453983, -628833668, 3950053, 1012201926, 1716988, 439978542, + 1935799, 496048908, -3756790, -962678241, 0, 0, + -3764867, -964747974, -1714295, -439288460, 3227876, 827143915, + 3574466, 915957677, 817536, 209493775, -1759347, -450833045, + -3415069, -875112161, 0, 0, -2129892, -545785280, + 1335936, 342333886, -676590, -173376332, -2156050, -552488273, + -3241972, -830756018, 4018989, 1029866791, -2071829, -530906624, + 0, 0, -2682288, -687336873, 434125, 111244624, + 3524442, 903139016, 3506380, 898510625, -1095468, -280713909, + -928749, -237992130, -394148, -101000509, 0, 0, + -3542485, -907762539, 1674615, 429120452, -2663378, -682491182, + -1159875, -297218217, -3704823, -949361686, -2101410, -538486762, + 3110818, 797147778, 0, 0, 601683, 154181397, + 4063053, 1041158200, 3370349, 863652652, 3586446, 919027554, + -2740543, -702264730, -3182878, -815613168, -3602218, -923069133, + 0, 0, -294725, -75523344, -3761513, -963888510, + -3765607, -964937599, 3201430, 820367122, 3145678, 806080660, + 2883726, 738955404, 3201494, 820383522, 0, 0, }; #else /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/intt.S b/mldsa/native/aarch64/src/intt.S index 7591271e..25792929 100644 --- a/mldsa/native/aarch64/src/intt.S +++ b/mldsa/native/aarch64/src/intt.S @@ -51,43 +51,43 @@ mulmod \b, tmp, \root, \root_twisted .endm -.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 mulmod \dst0, \src0, ninv, ninv_tw mulmod \dst1, \src1, ninv, ninv_tw mulmod \dst2, \src2, ninv, ninv_tw mulmod \dst3, \src3, ninv, ninv_tw - mulmod 
\dst4, \src4, ninv, ninv_tw - mulmod \dst5, \src5, ninv, ninv_tw - mulmod \dst6, \src6, ninv, ninv_tw - mulmod \dst7, \src7, ninv, ninv_tw .endm -.macro load_roots_1234 r_ptr - ldr q_root0, [\r_ptr], #(8*16) - ldr q_root1, [\r_ptr, #(-8*16 + 1*16)] - ldr q_root2, [\r_ptr, #(-8*16 + 2*16)] - ldr q_root3, [\r_ptr, #(-8*16 + 3*16)] - ldr q_root4, [\r_ptr, #(-8*16 + 4*16)] - ldr q_root5, [\r_ptr, #(-8*16 + 5*16)] - ldr q_root6, [\r_ptr, #(-8*16 + 6*16)] - ldr q_root7, [\r_ptr, #(-8*16 + 7*16)] +.macro load_roots_123 + ldr q_root0, [r123456_ptr], #64 + ldr q_root1, [r123456_ptr, #(-64 + 16)] + ldr q_root2, [r123456_ptr, #(-64 + 32)] + ldr q_root3, [r123456_ptr, #(-64 + 48)] .endm -.macro load_next_roots_56 root0, r_ptr0 - ldr q_\root0, [\r_ptr0], #16 +.macro load_roots_456 + ldr q_root0, [r123456_ptr], #64 + ldr q_root1, [r123456_ptr, #(-64 + 16)] + ldr q_root2, [r123456_ptr, #(-64 + 32)] + ldr q_root3, [r123456_ptr, #(-64 + 48)] .endm -.macro load_next_roots_6 root0, r_ptr0 - ldr q_\root0, [\r_ptr0], #8 +.macro load_roots_78_part1 + ldr q_root0, [r78_ptr], #(12*16) + ldr q_root0_tw, [r78_ptr, #(-12*16 + 1*16)] + ldr q_root1, [r78_ptr, #(-12*16 + 2*16)] + ldr q_root1_tw, [r78_ptr, #(-12*16 + 3*16)] + ldr q_root2, [r78_ptr, #(-12*16 + 4*16)] + ldr q_root2_tw, [r78_ptr, #(-12*16 + 5*16)] .endm -.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr q_\root0, [\r_ptr1], #(6*16) - ldr q_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] - ldr q_\root1, [\r_ptr1, #(-6*16 + 2*16)] - ldr q_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] - ldr q_\root2, [\r_ptr1, #(-6*16 + 4*16)] - ldr q_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.macro load_roots_78_part2 + ldr q_root0, [r78_ptr, #(-12*16 + 6*16)] + ldr q_root0_tw, [r78_ptr, #(-12*16 + 7*16)] + ldr q_root1, [r78_ptr, #(-12*16 + 8*16)] + ldr q_root1_tw, [r78_ptr, #(-12*16 + 9*16)] + ldr q_root2, [r78_ptr, #(-12*16 + 10*16)] + ldr q_root2_tw, [r78_ptr, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 
@@ -132,141 +132,141 @@ MLD_ASM_FN_SYMBOL(intt_asm) push_stack - in .req x0 - r5678_ptr .req x1 - r1234_ptr .req x2 - inp .req x3 - count .req x4 - xtmp .req x5 - - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - data8 .req v16 - data9 .req v17 - data10 .req v18 - data11 .req v19 - data12 .req v20 - data13 .req v21 - data14 .req v22 - data15 .req v23 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - q_data8 .req q16 - q_data9 .req q17 - q_data10 .req q18 - q_data11 .req q19 - q_data12 .req q20 - q_data13 .req q21 - q_data14 .req q22 - q_data15 .req q23 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root3 .req v3 + in .req x0 + r78_ptr .req x1 + r123456_ptr .req x2 + + inp .req x3 + inpp .req x4 + count .req x5 + xtmp .req x6 + wtmp .req w6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + q_data0 .req q9 + q_data1 .req q10 + q_data2 .req q11 + q_data3 .req q12 + q_data4 .req q13 + q_data5 .req q14 + q_data6 .req q15 + q_data7 .req q16 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + q_root0 .req q0 + q_root1 .req q1 + q_root2 .req q2 + q_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v8 + q_modulus .req q8 + + mov inp, in + add inpp, inp, #64 + mov count, #8 + root0_tw .req v4 root1_tw .req v5 root2_tw .req v6 root3_tw .req v7 - - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root3 .req q3 q_root0_tw .req q4 q_root1_tw .req q5 q_root2_tw .req q6 q_root3_tw .req q7 - - tmp .req v24 - q_tmp .req q24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - modulus .req v29 - // load q = 8380417 movz wtmp, #57345 movk wtmp, #127, lsl #16 dup modulus.4s, wtmp - mov inp, in - - mov 
count, #16 - .p2align 2 -layer5678_start: - - ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in] - - load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r5678_ptr +layer45678_start: + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] + ld4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp] + load_roots_78_part1 + + // Layer 8 Part 1 gs_butterfly_v data0, data1, root1, root1_tw gs_butterfly_v data2, data3, root2, root2_tw + // Layer 7 Part 1 gs_butterfly_v data0, data2, root0, root0_tw gs_butterfly_v data1, data3, root0, root0_tw + load_roots_78_part2 + + // Layer 8 Part 2 + gs_butterfly_v data4, data5, root1, root1_tw + gs_butterfly_v data6, data7, root2, root2_tw + // Layer 7 Part 2 + gs_butterfly_v data4, data6, root0, root0_tw + gs_butterfly_v data5, data7, root0, root0_tw + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 - load_next_roots_6 root1, r1234_ptr - load_next_roots_56 root0, r1234_ptr + load_roots_456 - gs_butterfly data0, data1, root0, 0, 1 - gs_butterfly data2, data3, root0, 2, 3 - gs_butterfly data0, data2, root1, 0, 1 - gs_butterfly data1, data3, root1, 0, 1 + // Layer 6 + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 - str q_data0, [in], #(16*4) - str q_data1, [in, #(-16*4 + 1*16)] - str q_data2, [in, #(-16*4 + 2*16)] - str q_data3, [in, #(-16*4 + 3*16)] + // Layer 5 + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // Layer 4 + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + // Standard way using vector instructions + + str q_data0, [inp], #(16*4) + str q_data1, [inp, #(-16*4 + 1*16)] + str q_data2, [inp, #(-16*4 + 2*16)] + str q_data3, [inp, #(-16*4 + 
3*16)] + + str q_data4, [inpp], #(16*4) + str q_data5, [inpp, #(-16*4 + 1*16)] + str q_data6, [inpp, #(-16*4 + 2*16)] + str q_data7, [inpp, #(-16*4 + 3*16)] + + add inp, inp, #64 + add inpp, inpp, #64 subs count, count, #1 - cbnz count, layer5678_start - - .unreq root0_tw - .unreq root1_tw - .unreq root2_tw - .unreq root3_tw - .unreq q_root0_tw - .unreq q_root1_tw - .unreq q_root2_tw - .unreq q_root3_tw - .unreq t0 - .unreq t1 - - root4 .req v4 - root5 .req v5 - root6 .req v6 - root7 .req v7 - q_root4 .req q4 - q_root5 .req q5 - q_root6 .req q6 - q_root7 .req q7 + cbnz count, layer45678_start + ninv .req v25 ninv_tw .req v26 - mov in, inp - mov count, #4 + + mov count, #8 + // load ninv mov wtmp, #16382 // 2^(32 - 8) mod Q @@ -277,94 +277,54 @@ layer5678_start: movk wtmp, #64, lsl #16 dup ninv_tw.4s, wtmp - load_roots_1234 r1234_ptr + load_roots_123 .p2align 2 -layer1234_start: - ldr q_data0, [in, #(0*(512/8))] - ldr q_data1, [in, #(1*(512/8))] - ldr q_data2, [in, #(2*(512/8))] - ldr q_data3, [in, #(3*(512/8))] - ldr q_data4, [in, #(4*(512/8))] - ldr q_data5, [in, #(5*(512/8))] - ldr q_data6, [in, #(6*(512/8))] - ldr q_data7, [in, #(7*(512/8))] - ldr q_data8, [in, #(8*(512/8))] - ldr q_data9, [in, #(9*(512/8))] - ldr q_data10, [in, #(10*(512/8))] - ldr q_data11, [in, #(11*(512/8))] - ldr q_data12, [in, #(12*(512/8))] - ldr q_data13, [in, #(13*(512/8))] - ldr q_data14, [in, #(14*(512/8))] - ldr q_data15, [in, #(15*(512/8))] - - // layer4 - gs_butterfly data0, data1, root3, 2, 3 - gs_butterfly data2, data3, root4, 0, 1 - gs_butterfly data4, data5, root4, 2, 3 - gs_butterfly data6, data7, root5, 0, 1 - gs_butterfly data8, data9, root5, 2, 3 - gs_butterfly data10, data11, root6, 0, 1 - gs_butterfly data12, data13, root6, 2, 3 - gs_butterfly data14, data15, root7, 0, 1 - - // layer3 - gs_butterfly data0, data2, root1, 2, 3 - gs_butterfly data1, data3, root1, 2, 3 - gs_butterfly data4, data6, root2, 0, 1 - gs_butterfly data5, data7, root2, 0, 1 - gs_butterfly data8, 
data10, root2, 2, 3 - gs_butterfly data9, data11, root2, 2, 3 - gs_butterfly data12, data14, root3, 0, 1 - gs_butterfly data13, data15, root3, 0, 1 - - // layer2 - gs_butterfly data0, data4, root0, 2, 3 - gs_butterfly data1, data5, root0, 2, 3 - gs_butterfly data2, data6, root0, 2, 3 - gs_butterfly data3, data7, root0, 2, 3 - gs_butterfly data8, data12, root1, 0, 1 - gs_butterfly data9, data13, root1, 0, 1 - gs_butterfly data10, data14, root1, 0, 1 - gs_butterfly data11, data15, root1, 0, 1 - - // layer 1 - gs_butterfly data0, data8, root0, 0, 1 - gs_butterfly data1, data9, root0, 0, 1 - gs_butterfly data2, data10, root0, 0, 1 - gs_butterfly data3, data11, root0, 0, 1 - gs_butterfly data4, data12, root0, 0, 1 - gs_butterfly data5, data13, root0, 0, 1 - gs_butterfly data6, data14, root0, 0, 1 - gs_butterfly data7, data15, root0, 0, 1 - - str q_data8, [in, #(8*(512/8))] - str q_data9, [in, #(9*(512/8))] - str q_data10, [in, #(10*(512/8))] - str q_data11, [in, #(11*(512/8))] - str q_data12, [in, #(12*(512/8))] - str q_data13, [in, #(13*(512/8))] - str q_data14, [in, #(14*(512/8))] - str q_data15, [in, #(15*(512/8))] - - // Scale half the coeffs 2^-8 and the Montgomery factor 2^32. - // For the other half, the scaling has been merged into the - // multiplication with the twiddle factor on the last layer. 
- mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 +layer123_start: + + ldr q_data0, [in, #(0*(1024/8))] + ldr q_data1, [in, #(1*(1024/8))] + ldr q_data2, [in, #(2*(1024/8))] + ldr q_data3, [in, #(3*(1024/8))] + ldr q_data4, [in, #(4*(1024/8))] + ldr q_data5, [in, #(5*(1024/8))] + ldr q_data6, [in, #(6*(1024/8))] + ldr q_data7, [in, #(7*(1024/8))] + + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // root0[0] includes ninv, manually computed. + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + str q_data4, [in, #(4*(1024/8))] + str q_data5, [in, #(5*(1024/8))] + str q_data6, [in, #(6*(1024/8))] + str q_data7, [in, #(7*(1024/8))] + + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. 
+ mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 str q_data0, [in], #(16) - str q_data1, [in, #(-16 + 1*(512/8))] - str q_data2, [in, #(-16 + 2*(512/8))] - str q_data3, [in, #(-16 + 3*(512/8))] - str q_data4, [in, #(-16 + 4*(512/8))] - str q_data5, [in, #(-16 + 5*(512/8))] - str q_data6, [in, #(-16 + 6*(512/8))] - str q_data7, [in, #(-16 + 7*(512/8))] + str q_data1, [in, #(-16 + 1*(1024/8))] + str q_data2, [in, #(-16 + 2*(1024/8))] + str q_data3, [in, #(-16 + 3*(1024/8))] subs count, count, #1 - cbnz count, layer1234_start - - pop_stack - ret + cbnz count, layer123_start -#endif /* MLD_ARITH_BACKEND_AARCH64 */ + pop_stack + ret +#endif diff --git a/scripts/autogen b/scripts/autogen index 13132564..cb379aeb 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -605,10 +605,16 @@ def gen_aarch64_intt_zetas_layer78(): def gen_aarch64_intt_zetas_layer123456(): - for i in range(16): - yield from gen_aarch64_root_of_unity_for_block(4, i, inv=True) - yield from gen_aarch64_root_of_unity_for_block(5, i * 2, inv=True) - yield from gen_aarch64_root_of_unity_for_block(5, i * 2 + 1, inv=True) + for i in range(8): + yield from gen_aarch64_root_of_unity_for_block(3, i, inv=True) + + yield from gen_aarch64_root_of_unity_for_block(4, i * 2, inv=True) + yield from gen_aarch64_root_of_unity_for_block(4, i * 2 + 1, inv=True) + yield from gen_aarch64_root_of_unity_for_block(5, i * 4, inv=True) + yield from gen_aarch64_root_of_unity_for_block(5, i * 4 + 1, inv=True) + yield from gen_aarch64_root_of_unity_for_block(5, i * 4 + 2, inv=True) + yield from gen_aarch64_root_of_unity_for_block(5, i * 4 + 3, inv=True) + yield from (0, 0) # Padding # The last layer has the scaling by 1/256 integrated in the twiddle yield from gen_aarch64_root_of_unity_for_block(0, 0, inv=True, scale=True) @@ -619,14 +625,6 @@ def gen_aarch64_intt_zetas_layer123456(): yield from gen_aarch64_root_of_unity_for_block(2, 1, inv=True) yield from gen_aarch64_root_of_unity_for_block(2, 2, inv=True) 
yield from gen_aarch64_root_of_unity_for_block(2, 3, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 0, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 1, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 2, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 3, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 4, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 5, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 6, inv=True) - yield from gen_aarch64_root_of_unity_for_block(3, 7, inv=True) yield from (0, 0) # Padding From 652e61027192d9bdb29c5163cb507e7f6bac676f Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 15 May 2025 17:49:35 +0800 Subject: [PATCH 3/3] Run AArch64 INTT through SLOTHY Signed-off-by: Matthias J. Kannwischer --- dev/aarch64_clean/src/intt.S | 330 ++++++++ mldsa/native/aarch64/src/Makefile | 5 +- mldsa/native/aarch64/src/intt.S | 1318 ++++++++++++++++++++++++++--- 3 files changed, 1550 insertions(+), 103 deletions(-) create mode 100644 dev/aarch64_clean/src/intt.S diff --git a/dev/aarch64_clean/src/intt.S b/dev/aarch64_clean/src/intt.S new file mode 100644 index 00000000..25792929 --- /dev/null +++ b/dev/aarch64_clean/src/intt.S @@ -0,0 +1,330 @@ +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+// mulmodq dst, src, const, idx0, idx1
+//   Multiplies every 32-bit lane of src by the scalar const[idx0] and
+//   subtracts a multiple of modulus[0]:
+//       t2  = sqrdmulh(src, const[idx1])      (rounding doubling high half)
+//       dst = src * const[idx0] - t2 * modulus[0]      (low 32 bits)
+//   const[idx1] is presumably the precomputed twisted companion of
+//   const[idx0] (compare const_twisted in mulmod) -- confirm against the
+//   table generator. Clobbers t2; reads the modulus register alias.
+.macro mulmodq dst, src, const, idx0, idx1
+    sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()]
+    mul \dst\().4s, \src\().4s, \const\().s[\idx0\()]
+    mls \dst\().4s, t2.4s, modulus.s[0]
+.endm
+
+// mulmod dst, src, const, const_twisted
+//   Same three-instruction sequence as mulmodq, but with full-vector
+//   operands: each lane of src is paired with the matching lane of const and
+//   const_twisted. Clobbers t2; reads the modulus register alias.
+//   Keep sqrdmulh ahead of mul: mul_ninv forwards its operands here and its
+//   visible call site passes identical dst and src registers, so src is
+//   overwritten by the mul.
+.macro mulmod dst, src, const, const_twisted
+    sqrdmulh t2.4s, \src\().4s, \const_twisted\().4s
+    mul \dst\().4s, \src\().4s, \const\().4s
+    mls \dst\().4s, t2.4s, modulus.s[0]
+.endm
+
+
+// gs_butterfly a, b, root, idx0, idx1
+//   Butterfly on two vectors with a lane-indexed root:
+//       tmp = a - b;  a = a + b;  b = mulmodq(tmp, root[idx0], root[idx1])
+//   The difference has to be formed before a is overwritten by the sum.
+//   Clobbers tmp and t2. The gs_ prefix presumably refers to the
+//   Gentleman-Sande form -- naming only, not stated in this file.
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// gs_butterfly_v a, b, root, root_twisted
+//   Same butterfly as gs_butterfly, but the root and its twisted companion
+//   are full vectors (one root per lane), multiplied in through mulmod.
+//   Clobbers tmp and t2.
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+//   dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3. ninv and ninv_tw are
+//   register aliases that must be bound with .req before this macro is
+//   expanded. Clobbers t2.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// load_roots_123
+//   Loads 64 bytes from r123456_ptr into q_root0..q_root3. The first load
+//   post-increments the pointer by 64, so the other three offsets are taken
+//   relative to the already advanced pointer (-64 + k*16).
+.macro load_roots_123
+    ldr q_root0, [r123456_ptr], #64
+    ldr q_root1, [r123456_ptr, #(-64 + 16)]
+    ldr q_root2, [r123456_ptr, #(-64 + 32)]
+    ldr q_root3, [r123456_ptr, #(-64 + 48)]
+.endm
+
+// load_roots_456
+//   Same instruction sequence as load_roots_123 (64 bytes into
+//   q_root0..q_root3, pointer advanced by 64); kept under its own name for
+//   the call site that precedes layers 6, 5 and 4.
+.macro load_roots_456
+    ldr q_root0, [r123456_ptr], #64
+    ldr q_root1, [r123456_ptr, #(-64 + 16)]
+    ldr q_root2, [r123456_ptr, #(-64 + 32)]
+    ldr q_root3, [r123456_ptr, #(-64 + 48)]
+.endm
+
+// load_roots_78_part1
+//   Loads the first six vectors of a 12-vector (192-byte) record at r78_ptr:
+//   root0, root0_tw, root1, root1_tw, root2, root2_tw. The first load
+//   post-increments r78_ptr by the full record size, so the remaining
+//   offsets are written relative to the advanced pointer.
+.macro load_roots_78_part1
+    ldr q_root0, [r78_ptr], #(12*16)
+    ldr q_root0_tw, [r78_ptr, #(-12*16 + 1*16)]
+    ldr q_root1, [r78_ptr, #(-12*16 + 2*16)]
+    ldr q_root1_tw, [r78_ptr, #(-12*16 + 3*16)]
+    ldr q_root2, [r78_ptr, #(-12*16 + 4*16)]
+    ldr q_root2_tw, [r78_ptr, #(-12*16 + 5*16)]
+.endm
+
+// load_roots_78_part2
+//   Loads vectors 6..11 of the record that load_roots_78_part1 has just
+//   stepped over, into the same six root registers. Does not move r78_ptr,
+//   so it has to follow part1 within the same iteration.
+.macro load_roots_78_part2
+    ldr q_root0, [r78_ptr, #(-12*16 + 6*16)]
+    ldr q_root0_tw, [r78_ptr, #(-12*16 + 7*16)]
+    ldr q_root1, [r78_ptr, #(-12*16 + 8*16)]
+    ldr q_root1_tw, [r78_ptr, #(-12*16 + 9*16)]
+    ldr q_root2, [r78_ptr, #(-12*16 + 10*16)]
+    ldr q_root2_tw, [r78_ptr, #(-12*16 + 11*16)]
+.endm
+
+// transpose4 data0, data1, data2, data3
+//   In-place 4x4 transpose of the 32-bit lanes held in four vectors:
+//   afterwards dataK holds lane K of each of the four inputs, in input order.
+//   Clobbers t0..t3 (t2 is also the scratch register of mulmod/mulmodq).
+.macro transpose4 data0, data1, data2, data3
+    trn1 t0.4s, \data0\().4s, \data1\().4s
+    trn2 t1.4s, \data0\().4s, \data1\().4s
+    trn1 t2.4s, \data2\().4s, \data3\().4s
+    trn2 t3.4s, \data2\().4s, \data3\().4s
+
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// save_vregs
+//   Reserves 64 bytes of stack and stores d8..d15 there, i.e. the low 64
+//   bits of v8..v15, which is the part AAPCS64 requires a callee to keep.
+.macro save_vregs
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// restore_vregs
+//   Inverse of save_vregs: reloads d8..d15 and releases the 64 bytes.
+.macro restore_vregs
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm
+
+// push_stack / pop_stack
+//   Prologue and epilogue of intt_asm. They only wrap save_vregs and
+//   restore_vregs; no general-purpose register is stashed.
+.macro push_stack
+    save_vregs
+.endm
+
+.macro pop_stack
+    restore_vregs
+.endm
+
+// intt_asm -- inverse NTT kernel, computed in place (clean form that is fed
+// to SLOTHY; the loop labels below are passed to slothy-cli by name).
+//
+// Register arguments, as bound by the .req aliases below:
+//   x0 (in)          : base address of the data, read and written in place
+//   x1 (r78_ptr)     : table read by load_roots_78_part1/part2, one
+//                      12-vector (192-byte) record per first-loop iteration
+//   x2 (r123456_ptr) : table read by load_roots_456 (once per first-loop
+//                      iteration) and load_roots_123 (once), 64 bytes each
+// NOTE(review): the C prototype is not visible here; the layout of both
+// tables has to match what these macros consume.
+//
+// Uses x3-x6, v0-v16 and v24-v28. Only d8-d15 are preserved.
+//
+// Two loops of 8 iterations each:
+//   layer45678_start : per iteration, two 64-byte groups (at inp and at
+//                      inp + 64). ld4 de-interleaves each group, layers 8
+//                      and 7 run with per-lane roots, transpose4 brings the
+//                      lanes back to memory order, layers 6, 5 and 4 run
+//                      with lane-indexed roots. 8 * 128 = 1024 bytes total.
+//   layer123_start   : per iteration, 8 vectors spaced 128 bytes apart;
+//                      three more butterfly layers plus the final scaling.
+.text
+.global MLD_ASM_NAMESPACE(intt_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(intt_asm)
+    push_stack
+
+    in .req x0
+    r78_ptr .req x1
+    r123456_ptr .req x2
+
+    inp .req x3
+    inpp .req x4
+    count .req x5
+    xtmp .req x6
+    wtmp .req w6
+
+    data0 .req v9
+    data1 .req v10
+    data2 .req v11
+    data3 .req v12
+    data4 .req v13
+    data5 .req v14
+    data6 .req v15
+    data7 .req v16
+
+    q_data0 .req q9
+    q_data1 .req q10
+    q_data2 .req q11
+    q_data3 .req q12
+    q_data4 .req q13
+    q_data5 .req q14
+    q_data6 .req q15
+    q_data7 .req q16
+
+    root0 .req v0
+    root1 .req v1
+    root2 .req v2
+    root3 .req v3
+
+    q_root0 .req q0
+    q_root1 .req q1
+    q_root2 .req q2
+    q_root3 .req q3
+
+    tmp .req v24
+    t0 .req v25
+    t1 .req v26
+    t2 .req v27
+    t3 .req v28
+
+    modulus .req v8
+    q_modulus .req q8
+
+    mov inp, in // in itself stays untouched until the second loop
+    add inpp, inp, #64 // inpp always runs 64 bytes ahead of inp
+    mov count, #8
+
+    root0_tw .req v4
+    root1_tw .req v5
+    root2_tw .req v6
+    root3_tw .req v7
+    q_root0_tw .req q4
+    q_root1_tw .req q5
+    q_root2_tw .req q6
+    q_root3_tw .req q7
+
+    // load q = 8380417
+    movz wtmp, #57345 // low half: 57345 + 127 * 65536 = 8380417
+    movk wtmp, #127, lsl #16
+    dup modulus.4s, wtmp
+
+    .p2align 2
+layer45678_start:
+    ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp]
+    ld4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp]
+
+    load_roots_78_part1
+
+    // Layer 8 Part 1
+    gs_butterfly_v data0, data1, root1, root1_tw
+    gs_butterfly_v data2, data3, root2, root2_tw
+    // Layer 7 Part 1
+    gs_butterfly_v data0, data2, root0, root0_tw
+    gs_butterfly_v data1, data3, root0, root0_tw
+
+    load_roots_78_part2
+
+    // Layer 8 Part 2
+    gs_butterfly_v data4, data5, root1, root1_tw
+    gs_butterfly_v data6, data7, root2, root2_tw
+    // Layer 7 Part 2
+    gs_butterfly_v data4, data6, root0, root0_tw
+    gs_butterfly_v data5, data7, root0, root0_tw
+
+    transpose4 data0, data1, data2, data3
+    transpose4 data4, data5, data6, data7
+
+    load_roots_456
+
+    // Layer 6
+    gs_butterfly data0, data1, root1, 2, 3
+    gs_butterfly data2, data3, root2, 0, 1
+    gs_butterfly data4, data5, root2, 2, 3
+    gs_butterfly data6, data7, root3, 0, 1
+
+    // Layer 5
+    gs_butterfly data0, data2, root0, 2, 3
+    gs_butterfly data1, data3, root0, 2, 3
+    gs_butterfly data4, data6, root1, 0, 1
+    gs_butterfly data5, data7, root1, 0, 1
+
+    // Layer 4
+    gs_butterfly data0, data4, root0, 0, 1
+    gs_butterfly data1, data5, root0, 0, 1
+    gs_butterfly data2, data6, root0, 0, 1
+    gs_butterfly data3, data7, root0, 0, 1
+
+    // Standard way using vector instructions
+
+    str q_data0, [inp], #(16*4) // post-index: inp += 64
+    str q_data1, [inp, #(-16*4 + 1*16)]
+    str q_data2, [inp, #(-16*4 + 2*16)]
+    str q_data3, [inp, #(-16*4 + 3*16)]
+
+    str q_data4, [inpp], #(16*4) // post-index: inpp += 64
+    str q_data5, [inpp, #(-16*4 + 1*16)]
+    str q_data6, [inpp, #(-16*4 + 2*16)]
+    str q_data7, [inpp, #(-16*4 + 3*16)]
+
+    add inp, inp, #64 // skip the group handled via inpp: net step 128 bytes
+    add inpp, inpp, #64
+
+    subs count, count, #1 // flags are not consumed; cbnz tests count itself
+    cbnz count, layer45678_start
+
+    // ninv / ninv_tw share v25 / v26 with t0 / t1; transpose4, the only user
+    // of t0 / t1, is not expanded in the second loop.
+    ninv .req v25
+    ninv_tw .req v26
+
+
+    mov count, #8
+
+
+    // load ninv
+    mov wtmp, #16382 // 2^(32 - 8) mod Q
+    dup ninv.4s, wtmp
+
+    // load ninv_tw = 4197891
+    movz wtmp, #3587 // low half: 3587 + 64 * 65536 = 4197891
+    movk wtmp, #64, lsl #16
+    dup ninv_tw.4s, wtmp
+
+    // Loaded once: the same four root vectors serve all 8 iterations below.
+    load_roots_123
+
+    .p2align 2
+layer123_start:
+
+    ldr q_data0, [in, #(0*(1024/8))]
+    ldr q_data1, [in, #(1*(1024/8))]
+    ldr q_data2, [in, #(2*(1024/8))]
+    ldr q_data3, [in, #(3*(1024/8))]
+    ldr q_data4, [in, #(4*(1024/8))]
+    ldr q_data5, [in, #(5*(1024/8))]
+    ldr q_data6, [in, #(6*(1024/8))]
+    ldr q_data7, [in, #(7*(1024/8))]
+
+    gs_butterfly data0, data1, root1, 2, 3
+    gs_butterfly data2, data3, root2, 0, 1
+    gs_butterfly data4, data5, root2, 2, 3
+    gs_butterfly data6, data7, root3, 0, 1
+
+    gs_butterfly data0, data2, root0, 2, 3
+    gs_butterfly data1, data3, root0, 2, 3
+    gs_butterfly data4, data6, root1, 0, 1
+    gs_butterfly data5, data7, root1, 0, 1
+
+    // root0[0] includes ninv, manually computed.
+    gs_butterfly data0, data4, root0, 0, 1
+    gs_butterfly data1, data5, root0, 0, 1
+    gs_butterfly data2, data6, root0, 0, 1
+    gs_butterfly data3, data7, root0, 0, 1
+
+    str q_data4, [in, #(4*(1024/8))]
+    str q_data5, [in, #(5*(1024/8))]
+    str q_data6, [in, #(6*(1024/8))]
+    str q_data7, [in, #(7*(1024/8))]
+
+    // Scale half the coeffs by 1/n; for the other half, the scaling has
+    // been merged into the multiplication with the twiddle factor on the
+    // last layer.
+    mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
+
+    str q_data0, [in], #(16) // post-index: in += 16, next column of vectors
+    str q_data1, [in, #(-16 + 1*(1024/8))]
+    str q_data2, [in, #(-16 + 2*(1024/8))]
+    str q_data3, [in, #(-16 + 3*(1024/8))]
+
+    subs count, count, #1 // flags are not consumed; cbnz tests count itself
+    cbnz count, layer123_start
+
+    pop_stack
+    ret
+#endif
diff --git a/mldsa/native/aarch64/src/Makefile b/mldsa/native/aarch64/src/Makefile index 8e39f142..4f122613 100644 --- a/mldsa/native/aarch64/src/Makefile +++ b/mldsa/native/aarch64/src/Makefile @@ -20,7 +20,6 @@ SLOTHY_FLAGS=-c sw_pipelining.enabled=true \ -c sw_pipelining.allow_post \ -c variable_size \ -c constraints.stalls_first_attempt=64 \ - -c timeout=120 \ $(SLOTHY_EXTRA_FLAGS) # For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30. @@ -38,5 +37,9 @@ all: ntt.S ntt.S: ../../../../dev/aarch64_clean/src/ntt.S slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l layer123_start -l layer45678_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG) +intt.S: ../../../../dev/aarch64_clean/src/intt.S + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l layer123_start -l layer45678_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG) + + clean: -$(RM) -rf *.S diff --git a/mldsa/native/aarch64/src/intt.S b/mldsa/native/aarch64/src/intt.S index 25792929..b19b3caf 100644 --- a/mldsa/native/aarch64/src/intt.S +++ b/mldsa/native/aarch64/src/intt.S @@ -198,75 +198,647 @@ MLD_ASM_FN_SYMBOL(intt_asm) dup modulus.4s, wtmp .p2align 2 + // Instructions: 124 + // Expected cycles: 156 + // Expected IPC: 0.79 + // + // Cycle bound: 156.0 + // IPC bound: 0.79 + // + // Wall time: 1.97s + // User time: 1.97s + // + // -------------------------------------------------------------------- cycle (expected) ---------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----- + ld4
{v4.4S, v5.4S, v6.4S, v7.4S}, [x4] // *........................................................................................................................................................... + ldr q1, [x1, #144] // .........*.................................................................................................................................................. + add v12.4S, v4.4S, v5.4S // ...........*................................................................................................................................................ + sub v24.4S, v4.4S, v5.4S // ............*............................................................................................................................................... + add v23.4S, v6.4S, v7.4S // .............*.............................................................................................................................................. + sub v22.4S, v6.4S, v7.4S // ..............*............................................................................................................................................. + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x3] // ...............*............................................................................................................................................ + sqrdmulh v18.4S, v24.4S, v1.4S // ........................*................................................................................................................................... + add v27.4S, v12.4S, v23.4S // .........................*.................................................................................................................................. + sub v19.4S, v12.4S, v23.4S // ..........................*................................................................................................................................. 
+ ldr q4, [x1, #128] // ...........................*................................................................................................................................ + ldr q17, [x1, #160] // .............................*.............................................................................................................................. + mul v0.4S, v24.4S, v4.4S // ...............................*............................................................................................................................ + ldr q24, [x1, #176] // ................................*........................................................................................................................... + mul v7.4S, v22.4S, v17.4S // ..................................*......................................................................................................................... + sub v2.4S, v13.4S, v14.4S // ...................................*........................................................................................................................ + sqrdmulh v31.4S, v22.4S, v24.4S // ....................................*....................................................................................................................... + add v11.4S, v13.4S, v14.4S // .....................................*...................................................................................................................... + sub v6.4S, v15.4S, v16.4S // ......................................*..................................................................................................................... + add v15.4S, v15.4S, v16.4S // .......................................*.................................................................................................................... 
+ ldr q9, [x1, #48] // ........................................*................................................................................................................... + ldr q3, [x1, #80] // ..........................................*................................................................................................................. + ldr q26, [x1, #64] // ............................................*............................................................................................................... + ldr q10, [x1, #32] // ..............................................*............................................................................................................. + mls v0.4S, v18.4S, v8.S[0] // ................................................*........................................................................................................... + sqrdmulh v9.4S, v2.4S, v9.4S // .................................................*.......................................................................................................... + mls v7.4S, v31.4S, v8.S[0] // ..................................................*......................................................................................................... + sqrdmulh v18.4S, v6.4S, v3.4S // ...................................................*........................................................................................................ + ldr q30, [x1, #96] // ....................................................*....................................................................................................... + add v24.4S, v0.4S, v7.4S // ......................................................*..................................................................................................... 
+ ldr q29, [x1, #112] // .......................................................*.................................................................................................... + trn2 v14.4S, v27.4S, v24.4S // .........................................................*.................................................................................................. + trn1 v27.4S, v27.4S, v24.4S // ..........................................................*................................................................................................. + mul v4.4S, v19.4S, v30.4S // ...........................................................*................................................................................................ + sqrdmulh v17.4S, v19.4S, v29.4S // ............................................................*............................................................................................... + mul v16.4S, v6.4S, v26.4S // .............................................................*.............................................................................................. + mul v24.4S, v2.4S, v10.4S // ..............................................................*............................................................................................. + sub v19.4S, v0.4S, v7.4S // ...............................................................*............................................................................................ + ldr q31, [x1, #16] // ................................................................*........................................................................................... + mls v16.4S, v18.4S, v8.S[0] // ..................................................................*......................................................................................... 
+ mls v24.4S, v9.4S, v8.S[0] // ...................................................................*........................................................................................ + ldr q1, [x1], #(12*16) // ....................................................................*....................................................................................... + sub v25.4S, v11.4S, v15.4S // ......................................................................*..................................................................................... + sub v3.4S, v24.4S, v16.4S // .......................................................................*.................................................................................... + mul v26.4S, v19.4S, v30.4S // ........................................................................*................................................................................... + sqrdmulh v21.4S, v19.4S, v29.4S // .........................................................................*.................................................................................. + add v2.4S, v11.4S, v15.4S // ..........................................................................*................................................................................. + sqrdmulh v15.4S, v3.4S, v31.4S // ...........................................................................*................................................................................ + mul v19.4S, v25.4S, v1.4S // ............................................................................*............................................................................... + mul v23.4S, v3.4S, v1.4S // .............................................................................*.............................................................................. 
+ sqrdmulh v5.4S, v25.4S, v31.4S // ..............................................................................*............................................................................. + mls v4.4S, v17.4S, v8.S[0] // ...............................................................................*............................................................................ + mls v26.4S, v21.4S, v8.S[0] // ................................................................................*........................................................................... + mls v23.4S, v15.4S, v8.S[0] // .................................................................................*.......................................................................... + mls v19.4S, v5.4S, v8.S[0] // ..................................................................................*......................................................................... + add v29.4S, v24.4S, v16.4S // ...................................................................................*........................................................................ + trn2 v24.4S, v4.4S, v26.4S // ....................................................................................*....................................................................... + trn1 v18.4S, v4.4S, v26.4S // .....................................................................................*...................................................................... + trn2 v12.4S, v2.4S, v29.4S // ......................................................................................*..................................................................... + trn1 v9.4S, v19.4S, v23.4S // .......................................................................................*.................................................................... 
+ trn2 v7.4S, v19.4S, v23.4S // ........................................................................................*................................................................... + trn1 v16.4S, v2.4S, v29.4S // .........................................................................................*.................................................................. + trn2 v4.2D, v27.2D, v18.2D // ..........................................................................................*................................................................. + trn2 v19.2D, v14.2D, v24.2D // ...........................................................................................*................................................................ + trn1 v20.2D, v27.2D, v18.2D // ............................................................................................*............................................................... + trn1 v10.2D, v14.2D, v24.2D // .............................................................................................*.............................................................. + trn1 v22.2D, v12.2D, v7.2D // ..............................................................................................*............................................................. + trn2 v1.2D, v16.2D, v9.2D // ...............................................................................................*............................................................ + trn1 v2.2D, v16.2D, v9.2D // ................................................................................................*........................................................... + trn2 v6.2D, v12.2D, v7.2D // .................................................................................................*.......................................................... 
+ ldr q16, [x2, #32] // ..................................................................................................*......................................................... + ldr q15, [x2, #48] // ....................................................................................................*....................................................... + sub v28.4S, v4.4S, v19.4S // ......................................................................................................*..................................................... + sub v31.4S, v20.4S, v10.4S // .......................................................................................................*.................................................... + ldr q14, [x2, #16] // ........................................................................................................*................................................... + add v9.4S, v2.4S, v22.4S // ..........................................................................................................*................................................. + add v5.4S, v20.4S, v10.4S // ...........................................................................................................*................................................ + add v13.4S, v4.4S, v19.4S // ............................................................................................................*............................................... + add v0.4S, v1.4S, v6.4S // .............................................................................................................*.............................................. + sub v19.4S, v2.4S, v22.4S // ..............................................................................................................*............................................. 
+ sub v3.4S, v1.4S, v6.4S // ...............................................................................................................*............................................ + mul v12.4S, v31.4S, v16.S[2] // ................................................................................................................*........................................... + sqrdmulh v21.4S, v28.4S, v15.S[1] // .................................................................................................................*.......................................... + mul v10.4S, v28.4S, v15.S[0] // ..................................................................................................................*......................................... + sqrdmulh v6.4S, v31.4S, v16.S[3] // ...................................................................................................................*........................................ + ldr q22, [x2], #64 // ....................................................................................................................*....................................... + sub v24.4S, v9.4S, v0.4S // ......................................................................................................................*..................................... + sub v27.4S, v5.4S, v13.4S // .......................................................................................................................*.................................... + add v2.4S, v5.4S, v13.4S // ........................................................................................................................*................................... + add v20.4S, v9.4S, v0.4S // .........................................................................................................................*.................................. 
+ sqrdmulh v26.4S, v19.4S, v14.S[3] // ..........................................................................................................................*................................. + mul v5.4S, v3.4S, v16.S[0] // ...........................................................................................................................*................................ + mul v25.4S, v19.4S, v14.S[2] // ............................................................................................................................*............................... + sqrdmulh v7.4S, v3.4S, v16.S[1] // .............................................................................................................................*.............................. + mls v10.4S, v21.4S, v8.S[0] // ..............................................................................................................................*............................. + mls v12.4S, v6.4S, v8.S[0] // ...............................................................................................................................*............................ + sub v18.4S, v20.4S, v2.4S // ................................................................................................................................*........................... + mul v13.4S, v27.4S, v14.S[0] // .................................................................................................................................*.......................... + mul v16.4S, v24.4S, v22.S[2] // ..................................................................................................................................*......................... + sqrdmulh v30.4S, v24.4S, v22.S[3] // ...................................................................................................................................*........................ 
+ sqrdmulh v28.4S, v27.4S, v14.S[1] // ....................................................................................................................................*....................... + mls v25.4S, v26.4S, v8.S[0] // .....................................................................................................................................*...................... + mls v5.4S, v7.4S, v8.S[0] // ......................................................................................................................................*..................... + mul v27.4S, v18.4S, v22.S[0] // .......................................................................................................................................*.................... + sub v3.4S, v12.4S, v10.4S // ........................................................................................................................................*................... + sqrdmulh v24.4S, v18.4S, v22.S[1] // .........................................................................................................................................*.................. + mls v16.4S, v30.4S, v8.S[0] // ..........................................................................................................................................*................. + mls v13.4S, v28.4S, v8.S[0] // ...........................................................................................................................................*................ + add v15.4S, v12.4S, v10.4S // ............................................................................................................................................*............... + add v6.4S, v25.4S, v5.4S // .............................................................................................................................................*.............. 
+ mul v9.4S, v3.4S, v14.S[0] // ..............................................................................................................................................*............. + mls v27.4S, v24.4S, v8.S[0] // ...............................................................................................................................................*............ + sqrdmulh v4.4S, v3.4S, v14.S[1] // ................................................................................................................................................*........... + sub v11.4S, v25.4S, v5.4S // .................................................................................................................................................*.......... + add v5.4S, v16.4S, v13.4S // ..................................................................................................................................................*......... + add v31.4S, v20.4S, v2.4S // ...................................................................................................................................................*........ + add v26.4S, v6.4S, v15.4S // ....................................................................................................................................................*....... + str q27, [x4], #(16*4) // .....................................................................................................................................................*...... + mul v21.4S, v11.4S, v22.S[2] // ......................................................................................................................................................*..... + str q5, [x3, #32] // .......................................................................................................................................................*.... 
+ mls v9.4S, v4.4S, v8.S[0] // ........................................................................................................................................................*... + str q26, [x3, #16] // .........................................................................................................................................................*.. + sqrdmulh v23.4S, v11.4S, v22.S[3] // ..........................................................................................................................................................*. + str q31, [x3], #(16*4) // ...........................................................................................................................................................* + + // -------------------------------------------------------------------- cycle (expected) ---------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----- + // ldr q20, [x1, #80] // ..........................................*................................................................................................................. + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x3] // ...............*............................................................................................................................................ + // ldr q14, [x1, #160] // .............................*.............................................................................................................................. + // sub v25.4S, v11.4S, v12.4S // ......................................*..................................................................................................................... 
+ // ldr q21, [x1, #64] // ............................................*............................................................................................................... + // sqrdmulh v16.4S, v25.4S, v20.4S // ...................................................*........................................................................................................ + // add v3.4S, v11.4S, v12.4S // .......................................*.................................................................................................................... + // add v31.4S, v9.4S, v10.4S // .....................................*...................................................................................................................... + // sub v27.4S, v9.4S, v10.4S // ...................................*........................................................................................................................ + // ldr q26, [x1, #176] // ................................*........................................................................................................................... + // mul v25.4S, v25.4S, v21.4S // .............................................................*.............................................................................................. + // sub v17.4S, v31.4S, v3.4S // ......................................................................*..................................................................................... + // ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x4] // *........................................................................................................................................................... + // mls v25.4S, v16.4S, v8.S[0] // ..................................................................*......................................................................................... 
+ // add v1.4S, v31.4S, v3.4S // ..........................................................................*................................................................................. + // sub v16.4S, v12.4S, v13.4S // ..............*............................................................................................................................................. + // ldr q22, [x1, #32] // ..............................................*............................................................................................................. + // mul v2.4S, v16.4S, v14.4S // ..................................*......................................................................................................................... + // ldr q15, [x1, #128] // ...........................*................................................................................................................................ + // sqrdmulh v14.4S, v16.4S, v26.4S // ....................................*....................................................................................................................... + // ldr q9, [x1, #144] // .........*.................................................................................................................................................. + // ldr q6, [x1, #48] // ........................................*................................................................................................................... + // mls v2.4S, v14.4S, v8.S[0] // ..................................................*......................................................................................................... + // mul v18.4S, v27.4S, v22.4S // ..............................................................*............................................................................................. 
+ // sqrdmulh v23.4S, v27.4S, v6.4S // .................................................*.......................................................................................................... + // add v0.4S, v12.4S, v13.4S // .............*.............................................................................................................................................. + // ldr q20, [x1], #(12*16) // ....................................................................*....................................................................................... + // add v21.4S, v10.4S, v11.4S // ...........*................................................................................................................................................ + // ldr q24, [x1, #-96] // ....................................................*....................................................................................................... + // sub v29.4S, v10.4S, v11.4S // ............*............................................................................................................................................... + // ldr q10, [x1, #-80] // .......................................................*.................................................................................................... + // ldr q19, [x1, #-176] // ................................................................*........................................................................................... + // mls v18.4S, v23.4S, v8.S[0] // ...................................................................*........................................................................................ + // sqrdmulh v4.4S, v29.4S, v9.4S // ........................*................................................................................................................................... 
+ // sqrdmulh v11.4S, v17.4S, v19.4S // ..............................................................................*............................................................................. + // mul v27.4S, v17.4S, v20.4S // ............................................................................*............................................................................... + // mul v22.4S, v29.4S, v15.4S // ...............................*............................................................................................................................ + // sub v9.4S, v18.4S, v25.4S // .......................................................................*.................................................................................... + // ldr q13, [x2, #32] // ..................................................................................................*......................................................... + // sqrdmulh v3.4S, v9.4S, v19.4S // ...........................................................................*................................................................................ + // mls v27.4S, v11.4S, v8.S[0] // ..................................................................................*......................................................................... + // sub v17.4S, v21.4S, v0.4S // ..........................*................................................................................................................................. + // add v7.4S, v21.4S, v0.4S // .........................*.................................................................................................................................. + // mul v29.4S, v9.4S, v20.4S // .............................................................................*.............................................................................. 
+ // ldr q12, [x2, #48] // ....................................................................................................*....................................................... + // mul v30.4S, v17.4S, v24.4S // ...........................................................*................................................................................................ + // mls v22.4S, v4.4S, v8.S[0] // ................................................*........................................................................................................... + // ldr q9, [x2, #16] // ........................................................................................................*................................................... + // mls v29.4S, v3.4S, v8.S[0] // .................................................................................*.......................................................................... + // sub v20.4S, v22.4S, v2.4S // ...............................................................*............................................................................................ + // add v5.4S, v18.4S, v25.4S // ...................................................................................*........................................................................ + // add v25.4S, v22.4S, v2.4S // ......................................................*..................................................................................................... + // mul v6.4S, v20.4S, v24.4S // ........................................................................*................................................................................... + // trn1 v28.4S, v27.4S, v29.4S // .......................................................................................*.................................................................... 
+ // sqrdmulh v14.4S, v17.4S, v10.4S // ............................................................*............................................................................................... + // trn1 v0.4S, v7.4S, v25.4S // ..........................................................*................................................................................................. + // trn1 v4.4S, v1.4S, v5.4S // .........................................................................................*.................................................................. + // trn2 v17.4S, v1.4S, v5.4S // ......................................................................................*..................................................................... + // mls v30.4S, v14.4S, v8.S[0] // ...............................................................................*............................................................................ + // trn2 v19.4S, v27.4S, v29.4S // ........................................................................................*................................................................... + // sqrdmulh v21.4S, v20.4S, v10.4S // .........................................................................*.................................................................................. + // ldr q22, [x2], #64 // ....................................................................................................................*....................................... + // trn2 v24.2D, v17.2D, v19.2D // .................................................................................................*.......................................................... + // trn1 v26.2D, v17.2D, v19.2D // ..............................................................................................*............................................................. 
+ // trn2 v11.4S, v7.4S, v25.4S // .........................................................*.................................................................................................. + // trn1 v14.2D, v4.2D, v28.2D // ................................................................................................*........................................................... + // trn2 v23.2D, v4.2D, v28.2D // ...............................................................................................*............................................................ + // sub v25.4S, v14.4S, v26.4S // ..............................................................................................................*............................................. + // add v14.4S, v14.4S, v26.4S // ..........................................................................................................*................................................. + // add v3.4S, v23.4S, v24.4S // .............................................................................................................*.............................................. + // sqrdmulh v7.4S, v25.4S, v9.S[3] // ..........................................................................................................................*................................. + // mls v6.4S, v21.4S, v8.S[0] // ................................................................................*........................................................................... + // sub v5.4S, v23.4S, v24.4S // ...............................................................................................................*............................................ + // add v26.4S, v14.4S, v3.4S // .........................................................................................................................*.................................. 
+ // mul v21.4S, v25.4S, v9.S[2] // ............................................................................................................................*............................... + // trn2 v20.4S, v30.4S, v6.4S // ....................................................................................*....................................................................... + // mul v18.4S, v5.4S, v13.S[0] // ...........................................................................................................................*................................ + // trn1 v28.4S, v30.4S, v6.4S // .....................................................................................*...................................................................... + // trn2 v31.2D, v11.2D, v20.2D // ...........................................................................................*................................................................ + // mls v21.4S, v7.4S, v8.S[0] // .....................................................................................................................................*...................... + // trn1 v16.2D, v0.2D, v28.2D // ............................................................................................*............................................................... + // sub v17.4S, v14.4S, v3.4S // ......................................................................................................................*..................................... + // sqrdmulh v5.4S, v5.4S, v13.S[1] // .............................................................................................................................*.............................. + // trn1 v2.2D, v11.2D, v20.2D // .............................................................................................*.............................................................. 
+ // sqrdmulh v20.4S, v17.4S, v22.S[3] // ...................................................................................................................................*........................ + // add v30.4S, v16.4S, v2.4S // ...........................................................................................................*................................................ + // trn2 v29.2D, v0.2D, v28.2D // ..........................................................................................*................................................................. + // sub v15.4S, v16.4S, v2.4S // .......................................................................................................*.................................................... + // sub v7.4S, v29.4S, v31.4S // ......................................................................................................*..................................................... + // add v19.4S, v29.4S, v31.4S // ............................................................................................................*............................................... + // mls v18.4S, v5.4S, v8.S[0] // ......................................................................................................................................*..................... + // sqrdmulh v5.4S, v15.4S, v13.S[3] // ...................................................................................................................*........................................ + // sqrdmulh v23.4S, v7.4S, v12.S[1] // .................................................................................................................*.......................................... + // add v3.4S, v30.4S, v19.4S // ........................................................................................................................*................................... 
+ // mul v16.4S, v17.4S, v22.S[2] // ..................................................................................................................................*......................... + // mul v14.4S, v7.4S, v12.S[0] // ..................................................................................................................*......................................... + // sub v27.4S, v30.4S, v19.4S // .......................................................................................................................*.................................... + // add v6.4S, v21.4S, v18.4S // .............................................................................................................................................*.............. + // mls v16.4S, v20.4S, v8.S[0] // ..........................................................................................................................................*................. + // sqrdmulh v30.4S, v27.4S, v9.S[1] // ....................................................................................................................................*....................... + // mls v14.4S, v23.4S, v8.S[0] // ..............................................................................................................................*............................. + // sub v24.4S, v21.4S, v18.4S // .................................................................................................................................................*.......... + // add v20.4S, v26.4S, v3.4S // ...................................................................................................................................................*........ + // mul v19.4S, v15.4S, v13.S[2] // ................................................................................................................*........................................... 
+ // sub v11.4S, v26.4S, v3.4S // ................................................................................................................................*........................... + // sqrdmulh v23.4S, v24.4S, v22.S[3] // ..........................................................................................................................................................*. + // mul v13.4S, v27.4S, v9.S[0] // .................................................................................................................................*.......................... + // sqrdmulh v4.4S, v11.4S, v22.S[1] // .........................................................................................................................................*.................. + // mls v19.4S, v5.4S, v8.S[0] // ...............................................................................................................................*............................ + // mul v21.4S, v24.4S, v22.S[2] // ......................................................................................................................................................*..... + // str q20, [x3], #(16*4) // ...........................................................................................................................................................* + // mul v11.4S, v11.4S, v22.S[0] // .......................................................................................................................................*.................... + // sub v20.4S, v19.4S, v14.4S // ........................................................................................................................................*................... + // add v15.4S, v19.4S, v14.4S // ............................................................................................................................................*............... 
+ // mls v13.4S, v30.4S, v8.S[0] // ...........................................................................................................................................*................ + // sqrdmulh v3.4S, v20.4S, v9.S[1] // ................................................................................................................................................*........... + // add v2.4S, v6.4S, v15.4S // ....................................................................................................................................................*....... + // mls v11.4S, v4.4S, v8.S[0] // ...............................................................................................................................................*............ + // mul v9.4S, v20.4S, v9.S[0] // ..............................................................................................................................................*............. + // str q2, [x3, #-48] // .........................................................................................................................................................*.. + // add v2.4S, v16.4S, v13.4S // ..................................................................................................................................................*......... + // str q11, [x4], #(16*4) // .....................................................................................................................................................*...... + // mls v9.4S, v3.4S, v8.S[0] // ........................................................................................................................................................*... + // str q2, [x3, #-32] // .......................................................................................................................................................*.... 
+ + sub count, count, #1 layer45678_start: - ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] - ld4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp] - - load_roots_78_part1 - - // Layer 8 Part 1 - gs_butterfly_v data0, data1, root1, root1_tw - gs_butterfly_v data2, data3, root2, root2_tw - // Layer 7 Part 1 - gs_butterfly_v data0, data2, root0, root0_tw - gs_butterfly_v data1, data3, root0, root0_tw - - load_roots_78_part2 - - // Layer 8 Part 2 - gs_butterfly_v data4, data5, root1, root1_tw - gs_butterfly_v data6, data7, root2, root2_tw - // Layer 7 Part 2 - gs_butterfly_v data4, data6, root0, root0_tw - gs_butterfly_v data5, data7, root0, root0_tw - - transpose4 data0, data1, data2, data3 - transpose4 data4, data5, data6, data7 - - load_roots_456 - - // Layer 6 - gs_butterfly data0, data1, root1, 2, 3 - gs_butterfly data2, data3, root2, 0, 1 - gs_butterfly data4, data5, root2, 2, 3 - gs_butterfly data6, data7, root3, 0, 1 - - // Layer 5 - gs_butterfly data0, data2, root0, 2, 3 - gs_butterfly data1, data3, root0, 2, 3 - gs_butterfly data4, data6, root1, 0, 1 - gs_butterfly data5, data7, root1, 0, 1 - - // Layer 4 - gs_butterfly data0, data4, root0, 0, 1 - gs_butterfly data1, data5, root0, 0, 1 - gs_butterfly data2, data6, root0, 0, 1 - gs_butterfly data3, data7, root0, 0, 1 - - // Standard way using vector instructions - - str q_data0, [inp], #(16*4) - str q_data1, [inp, #(-16*4 + 1*16)] - str q_data2, [inp, #(-16*4 + 2*16)] - str q_data3, [inp, #(-16*4 + 3*16)] - - str q_data4, [inpp], #(16*4) - str q_data5, [inpp, #(-16*4 + 1*16)] - str q_data6, [inpp, #(-16*4 + 2*16)] - str q_data7, [inpp, #(-16*4 + 3*16)] - - add inp, inp, #64 - add inpp, inpp, #64 - - subs count, count, #1 + // Instructions: 144 + // Expected cycles: 174 + // Expected IPC: 0.83 + // + // Cycle bound: 174.0 + // IPC bound: 0.83 + // + // Wall time: 414.13s + // User time: 414.13s + // + // ----------------------------------------------------------------------------- cycle (expected) 
------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + mls v21.4S, v23.4S, v8.S[0] // *............................................................................................................................................................................. + ldr q20, [x1, #80] // .e............................................................................................................................................................................ + sub v18.4S, v16.4S, v13.4S // ...*.......................................................................................................................................................................... + add v31.4S, v21.4S, v9.4S // ....*......................................................................................................................................................................... + sub v30.4S, v21.4S, v9.4S // .....*........................................................................................................................................................................ + mul v2.4S, v18.4S, v22.S[0] // ......*....................................................................................................................................................................... + str q31, [x3, #-16] // .......*...................................................................................................................................................................... + add x3, x3, #64 // .......*...................................................................................................................................................................... 
+ ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x3] // ........e..................................................................................................................................................................... + ldr q14, [x1, #160] // .................e............................................................................................................................................................ + sub v25.4S, v11.4S, v12.4S // ...................e.......................................................................................................................................................... + ldr q21, [x1, #64] // ....................e......................................................................................................................................................... + sqrdmulh v16.4S, v25.4S, v20.4S // ......................e....................................................................................................................................................... + sub v24.4S, v6.4S, v15.4S // .......................*...................................................................................................................................................... + sqrdmulh v6.4S, v18.4S, v22.S[1] // ........................*..................................................................................................................................................... + mul v0.4S, v30.4S, v22.S[0] // .........................*.................................................................................................................................................... + sqrdmulh v23.4S, v30.4S, v22.S[1] // ..........................*................................................................................................................................................... 
+ add v3.4S, v11.4S, v12.4S // ...........................e.................................................................................................................................................. + mls v2.4S, v6.4S, v8.S[0] // ............................*................................................................................................................................................. + sqrdmulh v20.4S, v24.4S, v22.S[1] // .............................*................................................................................................................................................ + add v31.4S, v9.4S, v10.4S // ..............................e............................................................................................................................................... + sub v27.4S, v9.4S, v10.4S // ...............................e.............................................................................................................................................. + str q2, [x4, #-32] // ................................*............................................................................................................................................. + mul v29.4S, v24.4S, v22.S[0] // .................................*............................................................................................................................................ + mls v0.4S, v23.4S, v8.S[0] // ..................................*........................................................................................................................................... + ldr q26, [x1, #176] // ...................................e.......................................................................................................................................... 
+ mls v29.4S, v20.4S, v8.S[0] // .....................................*........................................................................................................................................ + mul v25.4S, v25.4S, v21.4S // ......................................e....................................................................................................................................... + str q0, [x4, #-16] // .......................................*...................................................................................................................................... + sub v17.4S, v31.4S, v3.4S // ........................................e..................................................................................................................................... + str q29, [x4, #-48] // .........................................*.................................................................................................................................... + add x4, x4, #64 // .........................................*.................................................................................................................................... + ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x4] // ..........................................e................................................................................................................................... + mls v25.4S, v16.4S, v8.S[0] // ...................................................e.......................................................................................................................... + add v1.4S, v31.4S, v3.4S // ....................................................e......................................................................................................................... 
+ sub v16.4S, v12.4S, v13.4S // .....................................................e........................................................................................................................ + ldr q22, [x1, #32] // ......................................................e....................................................................................................................... + mul v2.4S, v16.4S, v14.4S // ........................................................e..................................................................................................................... + ldr q15, [x1, #128] // .........................................................e.................................................................................................................... + sqrdmulh v14.4S, v16.4S, v26.4S // ...........................................................e.................................................................................................................. + ldr q9, [x1, #144] // ............................................................e................................................................................................................. + ldr q6, [x1, #48] // ..............................................................e............................................................................................................... + mls v2.4S, v14.4S, v8.S[0] // ................................................................e............................................................................................................. + mul v18.4S, v27.4S, v22.4S // .................................................................e............................................................................................................ 
+ sqrdmulh v23.4S, v27.4S, v6.4S // ..................................................................e........................................................................................................... + add v0.4S, v12.4S, v13.4S // ...................................................................e.......................................................................................................... + ldr q20, [x1], #(12*16) // ....................................................................e......................................................................................................... + add v21.4S, v10.4S, v11.4S // ......................................................................e....................................................................................................... + ldr q24, [x1, #-96] // .......................................................................e...................................................................................................... + sub v29.4S, v10.4S, v11.4S // .........................................................................e.................................................................................................... + ldr q10, [x1, #-80] // ..........................................................................e................................................................................................... + ldr q19, [x1, #-176] // ............................................................................e................................................................................................. + mls v18.4S, v23.4S, v8.S[0] // ..............................................................................e............................................................................................... 
+ sqrdmulh v4.4S, v29.4S, v9.4S // ...............................................................................e.............................................................................................. + sqrdmulh v11.4S, v17.4S, v19.4S // ................................................................................e............................................................................................. + mul v27.4S, v17.4S, v20.4S // .................................................................................e............................................................................................ + mul v22.4S, v29.4S, v15.4S // ..................................................................................e........................................................................................... + sub v9.4S, v18.4S, v25.4S // ...................................................................................e.......................................................................................... + ldr q13, [x2, #32] // ....................................................................................e......................................................................................... + sqrdmulh v3.4S, v9.4S, v19.4S // ......................................................................................e....................................................................................... + mls v27.4S, v11.4S, v8.S[0] // .......................................................................................e...................................................................................... + sub v17.4S, v21.4S, v0.4S // ........................................................................................e..................................................................................... 
+ add v7.4S, v21.4S, v0.4S // .........................................................................................e.................................................................................... + mul v29.4S, v9.4S, v20.4S // ..........................................................................................e................................................................................... + ldr q12, [x2, #48] // ...........................................................................................e.................................................................................. + mul v30.4S, v17.4S, v24.4S // .............................................................................................e................................................................................ + mls v22.4S, v4.4S, v8.S[0] // ..............................................................................................e............................................................................... + ldr q9, [x2, #16] // ...............................................................................................e.............................................................................. + mls v29.4S, v3.4S, v8.S[0] // .................................................................................................e............................................................................ + sub v20.4S, v22.4S, v2.4S // ..................................................................................................e........................................................................... + add v5.4S, v18.4S, v25.4S // ...................................................................................................e.......................................................................... 
+ add v25.4S, v22.4S, v2.4S // ....................................................................................................e......................................................................... + mul v6.4S, v20.4S, v24.4S // .....................................................................................................e........................................................................ + trn1 v28.4S, v27.4S, v29.4S // ......................................................................................................e....................................................................... + sqrdmulh v14.4S, v17.4S, v10.4S // .......................................................................................................e...................................................................... + trn1 v0.4S, v7.4S, v25.4S // ........................................................................................................e..................................................................... + trn1 v4.4S, v1.4S, v5.4S // .........................................................................................................e.................................................................... + trn2 v17.4S, v1.4S, v5.4S // ..........................................................................................................e................................................................... + mls v30.4S, v14.4S, v8.S[0] // ...........................................................................................................e.................................................................. + trn2 v19.4S, v27.4S, v29.4S // ............................................................................................................e................................................................. 
+ sqrdmulh v21.4S, v20.4S, v10.4S // .............................................................................................................e................................................................ + ldr q22, [x2], #64 // ..............................................................................................................e............................................................... + trn2 v24.2D, v17.2D, v19.2D // ................................................................................................................e............................................................. + trn1 v26.2D, v17.2D, v19.2D // .................................................................................................................e............................................................ + trn2 v11.4S, v7.4S, v25.4S // ..................................................................................................................e........................................................... + trn1 v14.2D, v4.2D, v28.2D // ...................................................................................................................e.......................................................... + trn2 v23.2D, v4.2D, v28.2D // ....................................................................................................................e......................................................... + sub v25.4S, v14.4S, v26.4S // .....................................................................................................................e........................................................ + add v14.4S, v14.4S, v26.4S // ......................................................................................................................e....................................................... 
+ add v3.4S, v23.4S, v24.4S // .......................................................................................................................e...................................................... + sqrdmulh v7.4S, v25.4S, v9.S[3] // ........................................................................................................................e..................................................... + mls v6.4S, v21.4S, v8.S[0] // .........................................................................................................................e.................................................... + sub v5.4S, v23.4S, v24.4S // ..........................................................................................................................e................................................... + add v26.4S, v14.4S, v3.4S // ...........................................................................................................................e.................................................. + mul v21.4S, v25.4S, v9.S[2] // ............................................................................................................................e................................................. + trn2 v20.4S, v30.4S, v6.4S // .............................................................................................................................e................................................ + mul v18.4S, v5.4S, v13.S[0] // ..............................................................................................................................e............................................... + trn1 v28.4S, v30.4S, v6.4S // ...............................................................................................................................e.............................................. 
+ trn2 v31.2D, v11.2D, v20.2D // ................................................................................................................................e............................................. + mls v21.4S, v7.4S, v8.S[0] // .................................................................................................................................e............................................ + trn1 v16.2D, v0.2D, v28.2D // ..................................................................................................................................e........................................... + sub v17.4S, v14.4S, v3.4S // ...................................................................................................................................e.......................................... + sqrdmulh v5.4S, v5.4S, v13.S[1] // ....................................................................................................................................e......................................... + trn1 v2.2D, v11.2D, v20.2D // .....................................................................................................................................e........................................ + sqrdmulh v20.4S, v17.4S, v22.S[3] // ......................................................................................................................................e....................................... + add v30.4S, v16.4S, v2.4S // .......................................................................................................................................e...................................... + trn2 v29.2D, v0.2D, v28.2D // ........................................................................................................................................e..................................... 
+ sub v15.4S, v16.4S, v2.4S // .........................................................................................................................................e.................................... + sub v7.4S, v29.4S, v31.4S // ..........................................................................................................................................e................................... + add v19.4S, v29.4S, v31.4S // ...........................................................................................................................................e.................................. + mls v18.4S, v5.4S, v8.S[0] // ............................................................................................................................................e................................. + sqrdmulh v5.4S, v15.4S, v13.S[3] // .............................................................................................................................................e................................ + sqrdmulh v23.4S, v7.4S, v12.S[1] // ..............................................................................................................................................e............................... + add v3.4S, v30.4S, v19.4S // ...............................................................................................................................................e.............................. + mul v16.4S, v17.4S, v22.S[2] // ................................................................................................................................................e............................. + mul v14.4S, v7.4S, v12.S[0] // .................................................................................................................................................e............................ 
+ sub v27.4S, v30.4S, v19.4S // ..................................................................................................................................................e........................... + add v6.4S, v21.4S, v18.4S // ...................................................................................................................................................e.......................... + mls v16.4S, v20.4S, v8.S[0] // ....................................................................................................................................................e......................... + sqrdmulh v30.4S, v27.4S, v9.S[1] // .....................................................................................................................................................e........................ + mls v14.4S, v23.4S, v8.S[0] // ......................................................................................................................................................e....................... + sub v24.4S, v21.4S, v18.4S // .......................................................................................................................................................e...................... + add v20.4S, v26.4S, v3.4S // ........................................................................................................................................................e..................... + mul v19.4S, v15.4S, v13.S[2] // .........................................................................................................................................................e.................... + sub v11.4S, v26.4S, v3.4S // ..........................................................................................................................................................e................... 
+ sqrdmulh v23.4S, v24.4S, v22.S[3] // ...........................................................................................................................................................e.................. + mul v13.4S, v27.4S, v9.S[0] // ............................................................................................................................................................e................. + sqrdmulh v4.4S, v11.4S, v22.S[1] // .............................................................................................................................................................e................ + mls v19.4S, v5.4S, v8.S[0] // ..............................................................................................................................................................e............... + mul v21.4S, v24.4S, v22.S[2] // ...............................................................................................................................................................e.............. + str q20, [x3], #(16*4) // ................................................................................................................................................................e............. + mul v11.4S, v11.4S, v22.S[0] // .................................................................................................................................................................e............ + sub v20.4S, v19.4S, v14.4S // ..................................................................................................................................................................e........... + add v15.4S, v19.4S, v14.4S // ...................................................................................................................................................................e.......... 
+ mls v13.4S, v30.4S, v8.S[0] // ....................................................................................................................................................................e......... + sqrdmulh v3.4S, v20.4S, v9.S[1] // .....................................................................................................................................................................e........ + add v2.4S, v6.4S, v15.4S // ......................................................................................................................................................................e....... + mls v11.4S, v4.4S, v8.S[0] // .......................................................................................................................................................................e...... + mul v9.4S, v20.4S, v9.S[0] // ........................................................................................................................................................................e..... + str q2, [x3, #-48] // .........................................................................................................................................................................e.... + add v2.4S, v16.4S, v13.4S // ..........................................................................................................................................................................e... + str q11, [x4], #(16*4) // ...........................................................................................................................................................................e.. + mls v9.4S, v3.4S, v8.S[0] // ............................................................................................................................................................................e. 
+ str q2, [x3, #-32] // .............................................................................................................................................................................e + + // -------------------------------------------------------------------------------------------------- cycle (expected) --------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x3] // .......e.....................................................................................................................................................................'.......~................................. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x4] // .........................................e...................................................................................................................................'......................................... + // ldr q0, [x1], #(12*16) // ...................................................................e.........................................................................................................'......................................... + // ldr q4, [x1, #(-12*16 + 1*16)] // ...........................................................................e.................................................................................................'......................................... + // ldr q1, [x1, #(-12*16 + 2*16)] // .....................................................e.......................................................................................................................'......................................... 
+ // ldr q5, [x1, #(-12*16 + 3*16)] // .............................................................e...............................................................................................................'......................................... + // ldr q2, [x1, #(-12*16 + 4*16)] // ...................e.........................................................................................................................................................'...................~..................... + // ldr q6, [x1, #(-12*16 + 5*16)] // e............................................................................................................................................................................'~........................................ + // sub v24.4s, v9.4s, v10.4s // ..............................e..............................................................................................................................................'..............................~.......... + // add v9.4s, v9.4s, v10.4s // .............................e...............................................................................................................................................'.............................~........... + // sqrdmulh v27.4s, v24.4s, v5.4s // .................................................................e...........................................................................................................'......................................... + // mul v10.4s, v24.4s, v1.4s // ................................................................e............................................................................................................'......................................... 
+ // mls v10.4s, v27.4s, v8.s[0] // .............................................................................e...............................................................................................'......................................... + // sub v24.4s, v11.4s, v12.4s // ..................e..........................................................................................................................................................'..................~...................... + // add v11.4s, v11.4s, v12.4s // ..........................e..................................................................................................................................................'..........................~.............. + // sqrdmulh v27.4s, v24.4s, v6.4s // .....................e.......................................................................................................................................................'.....................~................... + // mul v12.4s, v24.4s, v2.4s // .....................................e.......................................................................................................................................'.....................................~... + // mls v12.4s, v27.4s, v8.s[0] // ..................................................e..........................................................................................................................'......................................... + // sub v24.4s, v9.4s, v11.4s // .......................................e.....................................................................................................................................'.......................................~. 
+ // add v9.4s, v9.4s, v11.4s // ...................................................e.........................................................................................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...............................................................................e.............................................................................................'......................................... + // mul v11.4s, v24.4s, v0.4s // ................................................................................e............................................................................................'......................................... + // mls v11.4s, v27.4s, v8.s[0] // ......................................................................................e......................................................................................'......................................... + // sub v24.4s, v10.4s, v12.4s // ..................................................................................e..........................................................................................'......................................... + // add v10.4s, v10.4s, v12.4s // ..................................................................................................e..........................................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // .....................................................................................e.......................................................................................'......................................... 
+ // mul v12.4s, v24.4s, v0.4s // .........................................................................................e...................................................................................'......................................... + // mls v12.4s, v27.4s, v8.s[0] // ................................................................................................e............................................................................'......................................... + // ldr q0, [x1, #(-12*16 + 6*16)] // ......................................................................e......................................................................................................'......................................... + // ldr q4, [x1, #(-12*16 + 7*16)] // .........................................................................e...................................................................................................'......................................... + // ldr q1, [x1, #(-12*16 + 8*16)] // ........................................................e....................................................................................................................'......................................... + // ldr q5, [x1, #(-12*16 + 9*16)] // ...........................................................e.................................................................................................................'......................................... + // ldr q2, [x1, #(-12*16 + 10*16)] // ................e............................................................................................................................................................'................~........................ 
+ // ldr q6, [x1, #(-12*16 + 11*16)] // ..................................e..........................................................................................................................................'..................................~...... + // sub v24.4s, v13.4s, v14.4s // ........................................................................e....................................................................................................'......................................... + // add v13.4s, v13.4s, v14.4s // .....................................................................e.......................................................................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v5.4s // ..............................................................................e..............................................................................................'......................................... + // mul v14.4s, v24.4s, v1.4s // .................................................................................e...........................................................................................'......................................... + // mls v14.4s, v27.4s, v8.s[0] // .............................................................................................e...............................................................................'......................................... + // sub v24.4s, v15.4s, v16.4s // ....................................................e........................................................................................................................'......................................... 
+ // add v15.4s, v15.4s, v16.4s // ..................................................................e..........................................................................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ..........................................................e..................................................................................................................'......................................... + // mul v16.4s, v24.4s, v2.4s // .......................................................e.....................................................................................................................'......................................... + // mls v16.4s, v27.4s, v8.s[0] // ...............................................................e.............................................................................................................'......................................... + // sub v24.4s, v13.4s, v15.4s // .......................................................................................e.....................................................................................'......................................... + // add v13.4s, v13.4s, v15.4s // ........................................................................................e....................................................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ......................................................................................................e......................................................................'......................................... 
+ // mul v15.4s, v24.4s, v0.4s // ............................................................................................e................................................................................'......................................... + // mls v15.4s, v27.4s, v8.s[0] // ..........................................................................................................e..................................................................'......................................... + // sub v24.4s, v14.4s, v16.4s // .................................................................................................e...........................................................................'......................................... + // add v14.4s, v14.4s, v16.4s // ...................................................................................................e.........................................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ............................................................................................................e................................................................'......................................... + // mul v16.4s, v24.4s, v0.4s // ....................................................................................................e........................................................................'......................................... + // mls v16.4s, v27.4s, v8.s[0] // ........................................................................................................................e....................................................'......................................... 
+ // trn1 v25.4s, v9.4s, v10.4s // ........................................................................................................e....................................................................'......................................... + // trn2 v26.4s, v9.4s, v10.4s // .........................................................................................................e...................................................................'......................................... + // trn1 v27.4s, v11.4s, v12.4s // .....................................................................................................e.......................................................................'......................................... + // trn2 v28.4s, v11.4s, v12.4s // ...........................................................................................................e.................................................................'......................................... + // trn2 v11.2d, v25.2d, v27.2d // ...................................................................................................................e.........................................................'......................................... + // trn2 v12.2d, v26.2d, v28.2d // ...............................................................................................................e.............................................................'......................................... + // trn1 v9.2d, v25.2d, v27.2d // ..................................................................................................................e..........................................................'......................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ................................................................................................................e............................................................'......................................... + // trn1 v25.4s, v13.4s, v14.4s // .......................................................................................................e.....................................................................'......................................... + // trn2 v26.4s, v13.4s, v14.4s // .................................................................................................................e...........................................................'......................................... + // trn1 v27.4s, v15.4s, v16.4s // ..............................................................................................................................e..............................................'......................................... + // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................................................e................................................'......................................... + // trn2 v15.2d, v25.2d, v27.2d // .......................................................................................................................................e.....................................'......................................... + // trn2 v16.2d, v26.2d, v28.2d // ...............................................................................................................................e.............................................'......................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // .................................................................................................................................e...........................................'......................................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................................................................e........................................'......................................... + // ldr q0, [x2], #64 // .............................................................................................................e...............................................................'......................................... + // ldr q1, [x2, #(-64 + 16)] // ..............................................................................................e..............................................................................'......................................... + // ldr q2, [x2, #(-64 + 32)] // ...................................................................................e.........................................................................................'......................................... + // ldr q3, [x2, #(-64 + 48)] // ..........................................................................................e..................................................................................'......................................... + // sub v24.4s, v9.4s, v10.4s // ....................................................................................................................e........................................................'......................................... + // add v9.4s, v9.4s, v10.4s // .....................................................................................................................e.......................................................'......................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[3] // .......................................................................................................................e.....................................................'......................................... + // mul v10.4s, v24.4s, v1.s[2] // ...........................................................................................................................e.................................................'......................................... + // mls v10.4s, v27.4s, v8.s[0] // ................................................................................................................................e............................................'......................................... + // sub v24.4s, v11.4s, v12.4s // .........................................................................................................................e...................................................'......................................... + // add v11.4s, v11.4s, v12.4s // ......................................................................................................................e......................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ...................................................................................................................................e.........................................'......................................... + // mul v12.4s, v24.4s, v2.s[0] // .............................................................................................................................e...............................................'......................................... 
+ // mls v12.4s, v27.4s, v8.s[0] // ...........................................................................................................................................e.................................'......................................... + // sub v24.4s, v13.4s, v14.4s // ........................................................................................................................................e....................................'......................................... + // add v13.4s, v13.4s, v14.4s // ......................................................................................................................................e......................................'......................................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ............................................................................................................................................e................................'......................................... + // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................................................................................e....................'......................................... + // mls v14.4s, v27.4s, v8.s[0] // .............................................................................................................................................................e...............'......................................... + // sub v24.4s, v15.4s, v16.4s // .........................................................................................................................................e...................................'......................................... 
+ // add v15.4s, v15.4s, v16.4s // ..........................................................................................................................................e..................................'......................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // .............................................................................................................................................e...............................'......................................... + // mul v16.4s, v24.4s, v3.s[0] // ................................................................................................................................................e............................'......................................... + // mls v16.4s, v27.4s, v8.s[0] // .....................................................................................................................................................e.......................'......................................... + // sub v24.4s, v9.4s, v11.4s // ..................................................................................................................................e..........................................'......................................... + // add v9.4s, v9.4s, v11.4s // ..........................................................................................................................e..................................................'......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .....................................................................................................................................e.......................................'......................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................................................................e.............................'......................................... + // mls v11.4s, v27.4s, v8.s[0] // ...................................................................................................................................................e.........................'......................................... + // sub v24.4s, v10.4s, v12.4s // ......................................................................................................................................................e......................'......................................... + // add v10.4s, v10.4s, v12.4s // ..................................................................................................................................................e..........................'......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e..................'......................................... + // mul v12.4s, v24.4s, v0.s[2] // ..............................................................................................................................................................e..............'......................................... + // mls v12.4s, v27.4s, v8.s[0] // .............................................................................................................................................................................*......................................... 
+ // sub v24.4s, v13.4s, v15.4s // .................................................................................................................................................e...........................'......................................... + // add v13.4s, v13.4s, v15.4s // ..............................................................................................................................................e..............................'......................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ....................................................................................................................................................e........................'......................................... + // mul v15.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................e.................'......................................... + // mls v15.4s, v27.4s, v8.s[0] // ...................................................................................................................................................................e.........'......................................... + // sub v24.4s, v14.4s, v16.4s // .................................................................................................................................................................e...........'......................................... + // add v14.4s, v14.4s, v16.4s // ..................................................................................................................................................................e..........'......................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // ....................................................................................................................................................................e........'......................................... + // mul v16.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................e.....'......................................... + // mls v16.4s, v27.4s, v8.s[0] // ...........................................................................................................................................................................e.'......................................... + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................................e...................'......................................... + // add v9.4s, v9.4s, v13.4s // .......................................................................................................................................................e.....................'......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............................................................................................................................................................e................'......................................... + // mul v13.4s, v24.4s, v0.s[0] // ................................................................................................................................................................e............'......................................... 
+ // mls v13.4s, v27.4s, v8.s[0] // ......................................................................................................................................................................e......'......................................... + // sub v24.4s, v10.4s, v14.4s // ......................~......................................................................................................................................................'......................*.................. + // add v10.4s, v10.4s, v14.4s // .....................................................................................................................................................................e.......'......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............................~................................................................................................................................................'............................*............ + // mul v14.4s, v24.4s, v0.s[0] // ................................~............................................................................................................................................'................................*........ + // mls v14.4s, v27.4s, v8.s[0] // ....................................~........................................................................................................................................'....................................*.... + // sub v24.4s, v11.4s, v15.4s // ..~..........................................................................................................................................................................'..*...................................... 
+ // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................................................e...'......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .......................~.....................................................................................................................................................'.......................*................. + // mul v15.4s, v24.4s, v0.s[0] // .....~.......................................................................................................................................................................'.....*................................... + // mls v15.4s, v27.4s, v8.s[0] // ...........................~.................................................................................................................................................'...........................*............. + // sub v24.4s, v12.4s, v16.4s // ....~........................................................................................................................................................................'....*.................................... + // add v12.4s, v12.4s, v16.4s // ...~.........................................................................................................................................................................'...*..................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........................~...................................................................................................................................................'.........................*............... 
+ // mul v16.4s, v24.4s, v0.s[0] // ........................~....................................................................................................................................................'........................*................ + // mls v16.4s, v27.4s, v8.s[0] // .................................~...........................................................................................................................................'.................................*....... + // str q9, [x3], #(16*4) // ...............................................................................................................................................................e.............'......................................... + // str q10, [x3, #(-16*4 + 1*16)] // ........................................................................................................................................................................e....'......................................... + // str q11, [x3, #(-16*4 + 2*16)] // ............................................................................................................................................................................e'......................................... + // str q12, [x3, #(-16*4 + 3*16)] // ......~......................................................................................................................................................................'......*.................................. + // str q13, [x4], #(16*4) // ..........................................................................................................................................................................e..'......................................... 
+ // str q14, [x4, #(-16*4 + 1*16)] // ........................................~....................................................................................................................................'........................................* + // str q15, [x4, #(-16*4 + 2*16)] // ...............................~.............................................................................................................................................'...............................*......... + // str q16, [x4, #(-16*4 + 3*16)] // ......................................~......................................................................................................................................'......................................*.. + // add x3, x3, #64 // ......~......................................................................................................................................................................'......*.................................. + // add x4, x4, #64 // ........................................~....................................................................................................................................'........................................* + + sub count, count, 1 cbnz count, layer45678_start + // Instructions: 20 + // Expected cycles: 20 + // Expected IPC: 1.00 + // + // Cycle bound: 20.0 + // IPC bound: 1.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + sub v11.4S, v16.4S, v13.4S // *............................. + mls v21.4S, v23.4S, v8.S[0] // .*............................ + sub v19.4S, v6.4S, v15.4S // ..*........................... + sqrdmulh v16.4S, v11.4S, v22.S[1] // ...*.......................... + mul v6.4S, v11.4S, v22.S[0] // ....*......................... + sub v13.4S, v21.4S, v9.4S // .....*........................ 
+ mul v0.4S, v19.4S, v22.S[0] // ......*....................... + sqrdmulh v15.4S, v19.4S, v22.S[1] // .......*...................... + sqrdmulh v29.4S, v13.4S, v22.S[1] // ........*..................... + mul v13.4S, v13.4S, v22.S[0] // .........*.................... + add v17.4S, v21.4S, v9.4S // ..........*................... + mls v6.4S, v16.4S, v8.S[0] // ...........*.................. + mls v0.4S, v15.4S, v8.S[0] // ............*................. + str q17, [x3, #-16] // .............*................ + mls v13.4S, v29.4S, v8.S[0] // ..............*............... + str q6, [x4, #-32] // ...............*.............. + add x3, x3, #64 // ...............*.............. + str q0, [x4, #-48] // .................*............ + str q13, [x4, #-16] // ...................*.......... + add x4, x4, #64 // ...................*.......... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v21.4S, v23.4S, v8.S[0] // .*............................. + // sub v18.4S, v16.4S, v13.4S // *.............................. + // add v31.4S, v21.4S, v9.4S // ..........*.................... + // sub v30.4S, v21.4S, v9.4S // .....*......................... + // mul v2.4S, v18.4S, v22.S[0] // ....*.......................... + // str q31, [x3, #-16] // .............*................. + // add x3, x3, #64 // ...............*............... + // sub v24.4S, v6.4S, v15.4S // ..*............................ + // sqrdmulh v6.4S, v18.4S, v22.S[1] // ...*........................... + // mul v0.4S, v30.4S, v22.S[0] // .........*..................... + // sqrdmulh v23.4S, v30.4S, v22.S[1] // ........*...................... + // mls v2.4S, v6.4S, v8.S[0] // ...........*................... + // sqrdmulh v20.4S, v24.4S, v22.S[1] // .......*....................... + // str q2, [x4, #-32] // ...............*............... + // mul v29.4S, v24.4S, v22.S[0] // ......*........................ + // mls v0.4S, v23.4S, v8.S[0] // ..............*................ 
+        // mls v29.4S, v20.4S, v8.S[0]         // ............*..................
+        // str q0, [x4, #-16]                  // ...................*...........
+        // str q29, [x4, #-48]                 // .................*.............
+        // add x4, x4, #64                     // ...................*...........
+
         ninv .req v25
         ninv_tw .req v26
         mov count, #8
-
+
         // load ninv
         mov wtmp, #16382 // 2^(32 - 8) mod Q
@@ -280,50 +852,592 @@ layer45678_start:
         load_roots_123
         .p2align 2
+        // Straight-line block emitted ahead of the layer123_start loop label.
+        // NOTE(review): presumably the SLOTHY software-pipelining preamble of
+        // that loop -- confirm against the SLOTHY run that produced it.
+        //
+        // What the instructions below do:
+        //  - Load eight 128-bit vectors from x0 at byte offsets 0, 128, ..., 896.
+        //  - Combine them in three rounds of add/sub pairs. Each difference is
+        //    multiplied by a constant taken from a lane pair of v0-v3 using the
+        //    triple  mul (even lane) / sqrdmulh (odd lane) / mls by v8.S[0].
+        //    NOTE(review): v8.S[0] presumably holds the modulus Q -- confirm.
+        //  - The four sums of the last round go through the same triple, but
+        //    with ninv (v25) for mul and ninv_tw (v26) for sqrdmulh.
+        //  - Store the results for offsets 0, 128, 256, 512 and 896. The store to
+        //    offset 0 post-increments x0 by 16, so the later "#240" addresses the
+        //    old offset 256.
+        //  - The loads at #16, #144, #528, #656, #784, #912 (and #256, #384 after
+        //    the increment) fetch the vectors 16 bytes further on; the add/sub/mul
+        //    applied to them already start the next set of butterflies.
+        //  - Values not stored here (v27, v19, and v17 together with v13) stay in
+        //    registers and are consumed right after layer123_start.
+        //
+        // Instructions: 96
+        // Expected cycles: 111
+        // Expected IPC: 0.86
+        //
+        // Cycle bound: 111.0
+        // IPC bound: 0.86
+        //
+        // Wall time: 3.65s
+        // User time: 3.65s
+        //
+                                            // ---------------------------------------------- cycle (expected) ---------------------------------------------->
+                                            // 0                        25                       50                       75                       100
+                                            // |------------------------|------------------------|------------------------|------------------------|----------
+        ldr q23, [x0, #640]                 // *..............................................................................................................
+        ldr q5, [x0, #512]                  // ..*............................................................................................................
+        ldr q17, [x0, #256]                 // ....*..........................................................................................................
+        sub v4.4S, v5.4S, v23.4S            // ......*........................................................................................................
+        add v11.4S, v5.4S, v23.4S           // .......*.......................................................................................................
+        ldr q19, [x0, #384]                 // ........*......................................................................................................
+        ldr q14, [x0, #0]                   // ..........*....................................................................................................
+        add v23.4S, v17.4S, v19.4S          // ............*..................................................................................................
+        sub v13.4S, v17.4S, v19.4S          // .............*.................................................................................................
+        sqrdmulh v18.4S, v4.4S, v2.S[3]     // ..............*................................................................................................
+        mul v22.4S, v4.4S, v2.S[2]          // ...............*...............................................................................................
+        ldr q30, [x0, #896]                 // ................*..............................................................................................
+        ldr q29, [x0, #768]                 // ..................*............................................................................................
+        ldr q15, [x0, #128]                 // ....................*..........................................................................................
+        add v5.4S, v29.4S, v30.4S           // ......................*........................................................................................
+        sub v17.4S, v29.4S, v30.4S          // .......................*.......................................................................................
+        add v7.4S, v14.4S, v15.4S           // ........................*......................................................................................
+        sub v10.4S, v11.4S, v5.4S           // .........................*.....................................................................................
+        add v29.4S, v11.4S, v5.4S           // ..........................*....................................................................................
+        sub v6.4S, v14.4S, v15.4S           // ...........................*...................................................................................
+        add v31.4S, v7.4S, v23.4S           // ............................*..................................................................................
+        sub v21.4S, v7.4S, v23.4S           // .............................*.................................................................................
+        sqrdmulh v23.4S, v13.4S, v2.S[1]    // ..............................*................................................................................
+        mul v15.4S, v13.4S, v2.S[0]         // ...............................*...............................................................................
+        mls v22.4S, v18.4S, v8.S[0]         // ................................*..............................................................................
+        mul v9.4S, v17.4S, v3.S[0]          // .................................*.............................................................................
+        sqrdmulh v18.4S, v6.4S, v1.S[3]     // ..................................*............................................................................
+        mul v30.4S, v6.4S, v1.S[2]          // ...................................*...........................................................................
+        sqrdmulh v17.4S, v17.4S, v3.S[1]    // ....................................*..........................................................................
+        mls v15.4S, v23.4S, v8.S[0]         // .....................................*.........................................................................
+        sqrdmulh v7.4S, v10.4S, v1.S[1]     // ......................................*........................................................................
+        mls v30.4S, v18.4S, v8.S[0]         // .......................................*.......................................................................
+        mls v9.4S, v17.4S, v8.S[0]          // ........................................*......................................................................
+        mul v6.4S, v21.4S, v0.S[2]          // .........................................*.....................................................................
+        mul v23.4S, v10.4S, v1.S[0]         // ..........................................*....................................................................
+        sub v17.4S, v30.4S, v15.4S          // ...........................................*...................................................................
+        sub v16.4S, v22.4S, v9.4S           // ............................................*..................................................................
+        sqrdmulh v14.4S, v21.4S, v0.S[3]    // .............................................*.................................................................
+        mul v5.4S, v17.4S, v0.S[2]          // ..............................................*................................................................
+        mul v28.4S, v16.4S, v1.S[0]         // ...............................................*...............................................................
+        sqrdmulh v13.4S, v17.4S, v0.S[3]    // ................................................*..............................................................
+        sqrdmulh v12.4S, v16.4S, v1.S[1]    // .................................................*.............................................................
+        mls v6.4S, v14.4S, v8.S[0]          // ..................................................*............................................................
+        mls v23.4S, v7.4S, v8.S[0]          // ...................................................*...........................................................
+        sub v16.4S, v31.4S, v29.4S          // ....................................................*..........................................................
+        add v21.4S, v31.4S, v29.4S          // .....................................................*.........................................................
+        mls v28.4S, v12.4S, v8.S[0]         // ......................................................*........................................................
+        mls v5.4S, v13.4S, v8.S[0]          // .......................................................*.......................................................
+        add v7.4S, v30.4S, v15.4S           // ........................................................*......................................................
+        add v11.4S, v22.4S, v9.4S           // .........................................................*.....................................................
+        add v15.4S, v6.4S, v23.4S           // ..........................................................*....................................................
+        sub v4.4S, v5.4S, v28.4S            // ...........................................................*...................................................
+        add v12.4S, v7.4S, v11.4S           // ............................................................*..................................................
+        sub v29.4S, v6.4S, v23.4S           // .............................................................*.................................................
+        sqrdmulh v13.4S, v21.4S, v26.4S     // ..............................................................*................................................
+        sqrdmulh v23.4S, v15.4S, v26.4S     // ...............................................................*...............................................
+        mul v24.4S, v4.4S, v0.S[0]          // ................................................................*..............................................
+        mul v9.4S, v16.4S, v0.S[0]          // .................................................................*.............................................
+        mul v14.4S, v21.4S, v25.4S          // ..................................................................*............................................
+        mul v30.4S, v12.4S, v25.4S          // ...................................................................*...........................................
+        sqrdmulh v6.4S, v12.4S, v26.4S      // ....................................................................*..........................................
+        sqrdmulh v27.4S, v16.4S, v0.S[1]    // .....................................................................*.........................................
+        sqrdmulh v10.4S, v4.4S, v0.S[1]     // ......................................................................*........................................
+        mul v15.4S, v15.4S, v25.4S          // .......................................................................*.......................................
+        ldr q19, [x0, #784]                 // ........................................................................*......................................
+        ldr q18, [x0, #912]                 // ..........................................................................*....................................
+        ldr q16, [x0, #528]                 // ............................................................................*..................................
+        ldr q21, [x0, #656]                 // ..............................................................................*................................
+        ldr q20, [x0, #16]                  // ................................................................................*..............................
+        ldr q31, [x0, #144]                 // ..................................................................................*............................
+        add v12.4S, v16.4S, v21.4S          // ....................................................................................*..........................
+        mls v24.4S, v10.4S, v8.S[0]         // .....................................................................................*.........................
+        sqrdmulh v22.4S, v29.4S, v0.S[1]    // ......................................................................................*........................
+        mls v30.4S, v6.4S, v8.S[0]          // .......................................................................................*.......................
+        mls v9.4S, v27.4S, v8.S[0]          // ........................................................................................*......................
+        mul v27.4S, v29.4S, v0.S[0]         // .........................................................................................*.....................
+        add v29.4S, v19.4S, v18.4S          // ..........................................................................................*....................
+        mls v14.4S, v13.4S, v8.S[0]         // ...........................................................................................*...................
+        mls v15.4S, v23.4S, v8.S[0]         // ............................................................................................*..................
+        sub v4.4S, v20.4S, v31.4S           // .............................................................................................*.................
+        add v17.4S, v5.4S, v28.4S           // ..............................................................................................*................
+        str q30, [x0, #128]                 // ...............................................................................................*...............
+        sub v30.4S, v12.4S, v29.4S          // ................................................................................................*..............
+        sub v6.4S, v19.4S, v18.4S           // .................................................................................................*.............
+        sqrdmulh v13.4S, v17.4S, v26.4S     // ..................................................................................................*............
+        str q9, [x0, #512]                  // ...................................................................................................*...........
+        mul v18.4S, v4.4S, v1.S[2]          // ....................................................................................................*..........
+        mls v27.4S, v22.4S, v8.S[0]         // .....................................................................................................*.........
+        str q24, [x0, #896]                 // ......................................................................................................*........
+        sub v19.4S, v7.4S, v11.4S           // .......................................................................................................*.......
+        sub v28.4S, v16.4S, v21.4S          // ........................................................................................................*......
+        add v9.4S, v12.4S, v29.4S           // .........................................................................................................*.....
+        str q14, [x0], #(16)                // ..........................................................................................................*....
+        ldr q5, [x0, #384]                  // ...........................................................................................................*...
+        str q15, [x0, #240]                 // .............................................................................................................*.
+        ldr q24, [x0, #256]                 // ..............................................................................................................*
+
+        // The commented listing below repeats the instructions above in a
+        // different order (register names differ in places); each row carries
+        // the cycle mark of its counterpart above.
+        // NOTE(review): presumably the pre-scheduling source order -- confirm.
+                                               // ---------------------------------------------- cycle (expected) ---------------------------------------------->
+                                               // 0                        25                       50                       75                       100
+                                               // |------------------------|------------------------|------------------------|------------------------|----------
+        // ldr q23, [x0, #640]                 // *..............................................................................................................
+        // ldr q11, [x0, #512]                 // ..*............................................................................................................
+        // add v4.4S, v11.4S, v23.4S           // .......*.......................................................................................................
+        // sub v28.4S, v11.4S, v23.4S          // ......*........................................................................................................
+        // ldr q10, [x0, #768]                 // ..................*............................................................................................
+ // ldr q6, [x0, #896] // ................*.............................................................................................. + // ldr q20, [x0, #0] // ..........*.................................................................................................... + // add v13.4S, v10.4S, v6.4S // ......................*........................................................................................ + // ldr q31, [x0, #128] // ....................*.......................................................................................... + // add v9.4S, v4.4S, v13.4S // ..........................*.................................................................................... + // sub v6.4S, v10.4S, v6.4S // .......................*....................................................................................... + // ldr q5, [x0, #384] // ........*...................................................................................................... + // ldr q24, [x0, #256] // ....*.......................................................................................................... + // sub v30.4S, v4.4S, v13.4S // .........................*..................................................................................... + // sub v4.4S, v20.4S, v31.4S // ...........................*................................................................................... + // mul v18.4S, v4.4S, v1.S[2] // ...................................*........................................................................... + // sub v10.4S, v24.4S, v5.4S // .............*................................................................................................. + // add v7.4S, v24.4S, v5.4S // ............*.................................................................................................. 
+ // ldr q23, [x0, #656] // ..............................................................................*................................ + // sqrdmulh v4.4S, v4.4S, v1.S[3] // ..................................*............................................................................ + // add v21.4S, v20.4S, v31.4S // ........................*...................................................................................... + // ldr q11, [x0, #528] // ............................................................................*.................................. + // add v24.4S, v21.4S, v7.4S // ............................*.................................................................................. + // sub v22.4S, v21.4S, v7.4S // .............................*................................................................................. + // mul v15.4S, v10.4S, v2.S[0] // ...............................*............................................................................... + // sqrdmulh v21.4S, v10.4S, v2.S[1] // ..............................*................................................................................ + // mls v18.4S, v4.4S, v8.S[0] // .......................................*....................................................................... + // sqrdmulh v20.4S, v28.4S, v2.S[3] // ..............*................................................................................................ + // mul v16.4S, v28.4S, v2.S[2] // ...............*............................................................................................... + // add v4.4S, v11.4S, v23.4S // ....................................................................................*.......................... + // mul v14.4S, v22.4S, v0.S[2] // .........................................*..................................................................... 
+ // mul v29.4S, v6.4S, v3.S[0] // .................................*............................................................................. + // sub v28.4S, v11.4S, v23.4S // ........................................................................................................*...... + // sqrdmulh v23.4S, v6.4S, v3.S[1] // ....................................*.......................................................................... + // mls v15.4S, v21.4S, v8.S[0] // .....................................*......................................................................... + // mls v16.4S, v20.4S, v8.S[0] // ................................*.............................................................................. + // mls v29.4S, v23.4S, v8.S[0] // ........................................*...................................................................... + // sub v21.4S, v18.4S, v15.4S // ...........................................*................................................................... + // add v12.4S, v18.4S, v15.4S // ........................................................*...................................................... + // sub v5.4S, v16.4S, v29.4S // ............................................*.................................................................. + // mul v17.4S, v21.4S, v0.S[2] // ..............................................*................................................................ + // add v18.4S, v16.4S, v29.4S // .........................................................*..................................................... + // sqrdmulh v16.4S, v30.4S, v1.S[1] // ......................................*........................................................................ + // sqrdmulh v15.4S, v5.4S, v1.S[1] // .................................................*............................................................. 
+ // sqrdmulh v13.4S, v21.4S, v0.S[3] // ................................................*.............................................................. + // sub v21.4S, v24.4S, v9.4S // ....................................................*.......................................................... + // mul v23.4S, v5.4S, v1.S[0] // ...............................................*............................................................... + // mls v17.4S, v13.4S, v8.S[0] // .......................................................*....................................................... + // sqrdmulh v5.4S, v21.4S, v0.S[1] // .....................................................................*......................................... + // mul v19.4S, v21.4S, v0.S[0] // .................................................................*............................................. + // mls v23.4S, v15.4S, v8.S[0] // ......................................................*........................................................ + // add v11.4S, v24.4S, v9.4S // .....................................................*......................................................... + // mls v19.4S, v5.4S, v8.S[0] // ........................................................................................*...................... + // sub v13.4S, v17.4S, v23.4S // ...........................................................*................................................... + // add v17.4S, v17.4S, v23.4S // ..............................................................................................*................ + // sqrdmulh v23.4S, v22.4S, v0.S[3] // .............................................*................................................................. + // add v9.4S, v12.4S, v18.4S // ............................................................*.................................................. 
+ // str q19, [x0, #512] // ...................................................................................................*........... + // sub v19.4S, v12.4S, v18.4S // .......................................................................................................*....... + // mul v18.4S, v13.4S, v0.S[0] // ................................................................*.............................................. + // sqrdmulh v13.4S, v13.4S, v0.S[1] // ......................................................................*........................................ + // ldr q10, [x0, #784] // ........................................................................*...................................... + // ldr q6, [x0, #912] // ..........................................................................*.................................... + // mls v18.4S, v13.4S, v8.S[0] // .....................................................................................*......................... + // sqrdmulh v13.4S, v11.4S, v26.4S // ..............................................................*................................................ + // mul v11.4S, v11.4S, v25.4S // ..................................................................*............................................ + // mul v22.4S, v9.4S, v25.4S // ...................................................................*........................................... + // str q18, [x0, #896] // ......................................................................................................*........ + // ldr q20, [x0, #16] // ................................................................................*.............................. + // mls v11.4S, v13.4S, v8.S[0] // ...........................................................................................*................... 
+ // add v13.4S, v10.4S, v6.4S // ..........................................................................................*.................... + // ldr q31, [x0, #144] // ..................................................................................*............................ + // str q11, [x0], #(16) // ..........................................................................................................*.... + // mul v11.4S, v30.4S, v1.S[0] // ..........................................*.................................................................... + // sqrdmulh v30.4S, v9.4S, v26.4S // ....................................................................*.......................................... + // add v9.4S, v4.4S, v13.4S // .........................................................................................................*..... + // sub v6.4S, v10.4S, v6.4S // .................................................................................................*............. + // mls v14.4S, v23.4S, v8.S[0] // ..................................................*............................................................ + // mls v11.4S, v16.4S, v8.S[0] // ...................................................*........................................................... + // mls v22.4S, v30.4S, v8.S[0] // .......................................................................................*....................... + // ldr q5, [x0, #384] // ...........................................................................................................*... + // add v30.4S, v14.4S, v11.4S // ..........................................................*.................................................... + // sub v18.4S, v14.4S, v11.4S // .............................................................*................................................. 
+ // ldr q24, [x0, #256] // ..............................................................................................................* + // sqrdmulh v21.4S, v30.4S, v26.4S // ...............................................................*............................................... + // mul v7.4S, v30.4S, v25.4S // .......................................................................*....................................... + // sub v30.4S, v4.4S, v13.4S // ................................................................................................*.............. + // sqrdmulh v13.4S, v18.4S, v0.S[1] // ......................................................................................*........................ + // mul v27.4S, v18.4S, v0.S[0] // .........................................................................................*..................... + // str q22, [x0, #112] // ...............................................................................................*............... + // sub v4.4S, v20.4S, v31.4S // .............................................................................................*................. + // mls v7.4S, v21.4S, v8.S[0] // ............................................................................................*.................. + // mls v27.4S, v13.4S, v8.S[0] // .....................................................................................................*......... + // sqrdmulh v13.4S, v17.4S, v26.4S // ..................................................................................................*............ + // mul v18.4S, v4.4S, v1.S[2] // ....................................................................................................*.......... + // str q7, [x0, #240] // .............................................................................................................*. 
+ + sub count, count, #2 layer123_start: + // Instructions: 88 + // Expected cycles: 96 + // Expected IPC: 0.92 + // + // Cycle bound: 96.0 + // IPC bound: 0.92 + // + // Wall time: 35.25s + // User time: 35.25s + // + // -------------------------------------- cycle (expected) ---------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------------------- + sub v10.4S, v24.4S, v5.4S // *............................................................................................... + add v7.4S, v24.4S, v5.4S // .*.............................................................................................. + ldr q23, [x0, #656] // ..e............................................................................................. + sqrdmulh v4.4S, v4.4S, v1.S[3] // ....*........................................................................................... + add v21.4S, v20.4S, v31.4S // .....*.......................................................................................... + ldr q11, [x0, #528] // ......e......................................................................................... + add v24.4S, v21.4S, v7.4S // ........*....................................................................................... + sub v22.4S, v21.4S, v7.4S // .........*...................................................................................... + mul v15.4S, v10.4S, v2.S[0] // ..........*..................................................................................... + sqrdmulh v21.4S, v10.4S, v2.S[1] // ...........*.................................................................................... + mls v18.4S, v4.4S, v8.S[0] // ............*................................................................................... + sqrdmulh v20.4S, v28.4S, v2.S[3] // .............*.................................................................................. 
+ mul v16.4S, v28.4S, v2.S[2] // ..............*................................................................................. + add v4.4S, v11.4S, v23.4S // ...............e................................................................................ + mul v14.4S, v22.4S, v0.S[2] // ................*............................................................................... + mul v29.4S, v6.4S, v3.S[0] // .................*.............................................................................. + sub v28.4S, v11.4S, v23.4S // ..................e............................................................................. + mul v11.4S, v19.4S, v0.S[0] // ...................l............................................................................ + sqrdmulh v23.4S, v6.4S, v3.S[1] // ....................*........................................................................... + mls v15.4S, v21.4S, v8.S[0] // .....................*.......................................................................... + sqrdmulh v10.4S, v19.4S, v0.S[1] // ......................l......................................................................... + mls v16.4S, v20.4S, v8.S[0] // .......................*........................................................................ + mls v29.4S, v23.4S, v8.S[0] // ........................*....................................................................... + sub v21.4S, v18.4S, v15.4S // .........................*...................................................................... + add v12.4S, v18.4S, v15.4S // ..........................*..................................................................... + str q27, [x0, #752] // ...........................l.................................................................... + mul v23.4S, v17.4S, v25.4S // ............................l................................................................... 
+ sub v5.4S, v16.4S, v29.4S // .............................*.................................................................. + mul v17.4S, v21.4S, v0.S[2] // ..............................*................................................................. + add v18.4S, v16.4S, v29.4S // ...............................*................................................................ + sqrdmulh v16.4S, v30.4S, v1.S[1] // ................................*............................................................... + sqrdmulh v15.4S, v5.4S, v1.S[1] // .................................*.............................................................. + mls v23.4S, v13.4S, v8.S[0] // ..................................l............................................................. + mls v11.4S, v10.4S, v8.S[0] // ...................................l............................................................ + sqrdmulh v13.4S, v21.4S, v0.S[3] // ....................................*........................................................... + sub v21.4S, v24.4S, v9.4S // .....................................*.......................................................... + str q23, [x0, #368] // ......................................l......................................................... + mul v23.4S, v5.4S, v1.S[0] // .......................................*........................................................ + mls v17.4S, v13.4S, v8.S[0] // ........................................*....................................................... + sqrdmulh v5.4S, v21.4S, v0.S[1] // .........................................*...................................................... + mul v19.4S, v21.4S, v0.S[0] // ..........................................*..................................................... + mls v23.4S, v15.4S, v8.S[0] // ...........................................*.................................................... 
+ str q11, [x0, #624] // ............................................l................................................... + add v11.4S, v24.4S, v9.4S // .............................................*.................................................. + mls v19.4S, v5.4S, v8.S[0] // ..............................................*................................................. + sub v13.4S, v17.4S, v23.4S // ...............................................*................................................ + add v17.4S, v17.4S, v23.4S // ................................................*............................................... + sqrdmulh v23.4S, v22.4S, v0.S[3] // .................................................*.............................................. + add v9.4S, v12.4S, v18.4S // ..................................................*............................................. + str q19, [x0, #512] // ...................................................*............................................ + sub v19.4S, v12.4S, v18.4S // ....................................................*........................................... + mul v18.4S, v13.4S, v0.S[0] // .....................................................*.......................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // ......................................................*......................................... + ldr q10, [x0, #784] // .......................................................e........................................ + ldr q6, [x0, #912] // .........................................................e...................................... + mls v18.4S, v13.4S, v8.S[0] // ...........................................................*.................................... + sqrdmulh v13.4S, v11.4S, v26.4S // ............................................................*................................... 
+ mul v11.4S, v11.4S, v25.4S // .............................................................*.................................. + mul v22.4S, v9.4S, v25.4S // ..............................................................*................................. + str q18, [x0, #896] // ...............................................................*................................ + ldr q20, [x0, #16] // ................................................................e............................... + mls v11.4S, v13.4S, v8.S[0] // ..................................................................*............................. + add v13.4S, v10.4S, v6.4S // ...................................................................e............................ + ldr q31, [x0, #144] // ....................................................................e........................... + str q11, [x0], #(16) // ......................................................................*......................... + mul v11.4S, v30.4S, v1.S[0] // .......................................................................*........................ + sqrdmulh v30.4S, v9.4S, v26.4S // ........................................................................*....................... + add v9.4S, v4.4S, v13.4S // .........................................................................e...................... + sub v6.4S, v10.4S, v6.4S // ..........................................................................e..................... + mls v14.4S, v23.4S, v8.S[0] // ...........................................................................*.................... + mls v11.4S, v16.4S, v8.S[0] // ............................................................................*................... + mls v22.4S, v30.4S, v8.S[0] // .............................................................................*.................. 
+ ldr q5, [x0, #384] // ..............................................................................e................. + add v30.4S, v14.4S, v11.4S // ................................................................................*............... + sub v18.4S, v14.4S, v11.4S // .................................................................................*.............. + ldr q24, [x0, #256] // ..................................................................................e............. + sqrdmulh v21.4S, v30.4S, v26.4S // ....................................................................................*........... + mul v7.4S, v30.4S, v25.4S // .....................................................................................*.......... + sub v30.4S, v4.4S, v13.4S // ......................................................................................e......... + sqrdmulh v13.4S, v18.4S, v0.S[1] // .......................................................................................*........ + mul v27.4S, v18.4S, v0.S[0] // ........................................................................................*....... + str q22, [x0, #112] // .........................................................................................*...... + sub v4.4S, v20.4S, v31.4S // ..........................................................................................e..... + mls v7.4S, v21.4S, v8.S[0] // ...........................................................................................*.... + mls v27.4S, v13.4S, v8.S[0] // ............................................................................................*... + sqrdmulh v13.4S, v17.4S, v26.4S // .............................................................................................*.. + mul v18.4S, v4.4S, v1.S[2] // ..............................................................................................e. 
+ str q7, [x0, #240] // ...............................................................................................* - ldr q_data0, [in, #(0*(1024/8))] - ldr q_data1, [in, #(1*(1024/8))] - ldr q_data2, [in, #(2*(1024/8))] - ldr q_data3, [in, #(3*(1024/8))] - ldr q_data4, [in, #(4*(1024/8))] - ldr q_data5, [in, #(5*(1024/8))] - ldr q_data6, [in, #(6*(1024/8))] - ldr q_data7, [in, #(7*(1024/8))] - - gs_butterfly data0, data1, root1, 2, 3 - gs_butterfly data2, data3, root2, 0, 1 - gs_butterfly data4, data5, root2, 2, 3 - gs_butterfly data6, data7, root3, 0, 1 - - gs_butterfly data0, data2, root0, 2, 3 - gs_butterfly data1, data3, root0, 2, 3 - gs_butterfly data4, data6, root1, 0, 1 - gs_butterfly data5, data7, root1, 0, 1 - - // root0[0] includes ninv, manually computed. - gs_butterfly data0, data4, root0, 0, 1 - gs_butterfly data1, data5, root0, 0, 1 - gs_butterfly data2, data6, root0, 0, 1 - gs_butterfly data3, data7, root0, 0, 1 - - str q_data4, [in, #(4*(1024/8))] - str q_data5, [in, #(5*(1024/8))] - str q_data6, [in, #(6*(1024/8))] - str q_data7, [in, #(7*(1024/8))] - - // Scale half the coeffs by 1/n; for the other half, the scaling has - // been merged into the multiplication with the twiddle factor on the - // last layer. 
- mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - - str q_data0, [in], #(16) - str q_data1, [in, #(-16 + 1*(1024/8))] - str q_data2, [in, #(-16 + 2*(1024/8))] - str q_data3, [in, #(-16 + 3*(1024/8))] - - subs count, count, #1 + // ------------------------------------------------------------------------------------------------------------ cycle (expected) ------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------- + // ldr q9, [x0, #(0*(1024/8))] // ..............................................................e...............................'...............................................................~...............................'............................................ + // ldr q10, [x0, #(1*(1024/8))] // ..................................................................e...........................'...................................................................~...........................'............................................ + // ldr q11, [x0, #(2*(1024/8))] // ................................................................................e.............'.................................................................................~.............'............................................ + // ldr q12, [x0, #(3*(1024/8))] // ............................................................................e.................'.............................................................................~.................'............................................ 
+ // ldr q13, [x0, #(4*(1024/8))] // ....e.........................................................................................'.....~.........................................................................................'.....~...................................... + // ldr q14, [x0, #(5*(1024/8))] // e.............................................................................................'.~.............................................................................................'.~.......................................... + // ldr q15, [x0, #(6*(1024/8))] // .....................................................e........................................'......................................................~........................................'............................................ + // ldr q16, [x0, #(7*(1024/8))] // .......................................................e......................................'........................................................~......................................'............................................ + // sub v24.4s, v9.4s, v10.4s // ........................................................................................e.....'.........................................................................................~.....'............................................ + // add v9.4s, v9.4s, v10.4s // ...~..........................................................................................'....*..........................................................................................'....~....................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ..~...........................................................................................'...*...........................................................................................'...~........................................ 
+ // mul v10.4s, v24.4s, v1.s[2] // ............................................................................................e.'.............................................................................................~.'............................................ + // mls v10.4s, v27.4s, v8.s[0] // ..........~...................................................................................'...........*...................................................................................'...........~................................ + // sub v24.4s, v11.4s, v12.4s // ..............................................................................................*...............................................................................................~............................................ + // add v11.4s, v11.4s, v12.4s // ..............................................................................................'*..............................................................................................'~........................................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // .........~....................................................................................'..........*....................................................................................'..........~................................. + // mul v12.4s, v24.4s, v2.s[0] // ........~.....................................................................................'.........*.....................................................................................'.........~.................................. + // mls v12.4s, v27.4s, v8.s[0] // ...................~..........................................................................'....................*..........................................................................'....................~....................... 
+ // sub v24.4s, v13.4s, v14.4s // ................e.............................................................................'.................~.............................................................................'.................~.......................... + // add v13.4s, v13.4s, v14.4s // .............e................................................................................'..............~................................................................................'..............~............................. + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ...........~..................................................................................'............*..................................................................................'............~............................... + // mul v14.4s, v24.4s, v2.s[2] // ............~.................................................................................'.............*.................................................................................'.............~.............................. + // mls v14.4s, v27.4s, v8.s[0] // .....................~........................................................................'......................*........................................................................'......................~..................... + // sub v24.4s, v15.4s, v16.4s // ........................................................................e.....................'.........................................................................~.....................'............................................ + // add v15.4s, v15.4s, v16.4s // .................................................................e............................'..................................................................~............................'............................................ 
+ // sqrdmulh v27.4s, v24.4s, v3.s[1] // ..................~...........................................................................'...................*...........................................................................'...................~........................ + // mul v16.4s, v24.4s, v3.s[0] // ...............~..............................................................................'................*..............................................................................'................~........................... + // mls v16.4s, v27.4s, v8.s[0] // ......................~.......................................................................'.......................*.......................................................................'.......................~.................... + // sub v24.4s, v9.4s, v11.4s // .......~......................................................................................'........*......................................................................................'........~................................... + // add v9.4s, v9.4s, v11.4s // ......~.......................................................................................'.......*.......................................................................................'.......~.................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...............................................~..............................................'................................................*..............................................'............................................ + // mul v11.4s, v24.4s, v0.s[2] // ..............~...............................................................................'...............*...............................................................................'...............~............................ 
+ // mls v11.4s, v27.4s, v8.s[0] // .........................................................................~....................'..........................................................................*....................'............................................ + // sub v24.4s, v10.4s, v12.4s // .......................~......................................................................'........................*......................................................................'........................~................... + // add v10.4s, v10.4s, v12.4s // ........................~.....................................................................'.........................*.....................................................................'.........................~.................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..................................~...........................................................'...................................*...........................................................'...................................~........ + // mul v12.4s, v24.4s, v0.s[2] // ............................~.................................................................'.............................*.................................................................'.............................~.............. + // mls v12.4s, v27.4s, v8.s[0] // ......................................~.......................................................'.......................................*.......................................................'.......................................~.... + // sub v24.4s, v13.4s, v15.4s // ....................................................................................e.........'.....................................................................................~.........'............................................ 
+ // add v13.4s, v13.4s, v15.4s // .......................................................................e......................'........................................................................~......................'............................................ + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ..............................~...............................................................'...............................*...............................................................'...............................~............ + // mul v15.4s, v24.4s, v1.s[0] // .....................................................................~........................'......................................................................*........................'............................................ + // mls v15.4s, v27.4s, v8.s[0] // ..........................................................................~...................'...........................................................................*...................'............................................ + // sub v24.4s, v14.4s, v16.4s // ...........................~..................................................................'............................*..................................................................'............................~............... + // add v14.4s, v14.4s, v16.4s // .............................~................................................................'..............................*................................................................'..............................~............. + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ...............................~..............................................................'................................*..............................................................'................................~........... 
+ // mul v16.4s, v24.4s, v1.s[0] // .....................................~........................................................'......................................*........................................................'......................................~..... + // mls v16.4s, v27.4s, v8.s[0] // .........................................~....................................................'..........................................*....................................................'..........................................~. + // sub v24.4s, v9.4s, v13.4s // ...................................~..........................................................'....................................*..........................................................'....................................~....... + // add v9.4s, v9.4s, v13.4s // ...........................................~..................................................'............................................*..................................................'............................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .......................................~......................................................'........................................*......................................................'........................................~... + // mul v13.4s, v24.4s, v0.s[0] // ........................................~.....................................................'.........................................*.....................................................'.........................................~.. + // mls v13.4s, v27.4s, v8.s[0] // ............................................~.................................................'.............................................*.................................................'............................................ 
+ // sub v24.4s, v10.4s, v14.4s // ..................................................~...........................................'...................................................*...........................................'............................................ + // add v10.4s, v10.4s, v14.4s // ................................................~.............................................'.................................................*.............................................'............................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ....................~.........................................................................'.....................~.........................................................................'.....................l...................... + // mul v14.4s, v24.4s, v0.s[0] // .................~............................................................................'..................~............................................................................'..................l......................... + // mls v14.4s, v27.4s, v8.s[0] // .................................~............................................................'..................................~............................................................'..................................l......... + // sub v24.4s, v11.4s, v15.4s // ...............................................................................~..............'................................................................................*..............'............................................ + // add v11.4s, v11.4s, v15.4s // ..............................................................................~...............'...............................................................................*...............'............................................ 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // .....................................................................................~........'......................................................................................*........'............................................ + // mul v15.4s, v24.4s, v0.s[0] // ......................................................................................~.......'.......................................................................................*.......'............................................ + // mls v15.4s, v27.4s, v8.s[0] // ..........................................................................................~...'...........................................................................................*...'............................................ + // sub v24.4s, v12.4s, v16.4s // .............................................~................................................'..............................................*................................................'............................................ + // add v12.4s, v12.4s, v16.4s // ..............................................~...............................................'...............................................*...............................................'............................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ....................................................~.........................................'.....................................................*.........................................'............................................ + // mul v16.4s, v24.4s, v0.s[0] // ...................................................~..........................................'....................................................*..........................................'............................................ 
+ // mls v16.4s, v27.4s, v8.s[0] // .........................................................~....................................'..........................................................*....................................'............................................ + // str q13, [x0, #(4*(1024/8))] // .................................................~............................................'..................................................*............................................'............................................ + // str q14, [x0, #(5*(1024/8))] // ..........................................~...................................................'...........................................~...................................................'...........................................l + // str q15, [x0, #(6*(1024/8))] // .........................~....................................................................'..........................~....................................................................'..........................l................. + // str q16, [x0, #(7*(1024/8))] // .............................................................~................................'..............................................................*................................'............................................ + // sqrdmulh v27.4s, v9.4s, v26.4s // ..........................................................~...................................'...........................................................*...................................'............................................ + // mul v9.4s, v9.4s, v25.4s // ...........................................................~..................................'............................................................*..................................'............................................ 
+ // mls v9.4s, v27.4s, v8.s[0] // ................................................................~.............................'.................................................................*.............................'............................................ + // sqrdmulh v27.4s, v10.4s, v26.4s // ......................................................................~.......................'.......................................................................*.......................'............................................ + // mul v10.4s, v10.4s, v25.4s // ............................................................~.................................'.............................................................*.................................'............................................ + // mls v10.4s, v27.4s, v8.s[0] // ...........................................................................~..................'............................................................................*..................'............................................ + // sqrdmulh v27.4s, v11.4s, v26.4s // ..................................................................................~...........'...................................................................................*...........'............................................ + // mul v11.4s, v11.4s, v25.4s // ...................................................................................~..........'....................................................................................*..........'............................................ + // mls v11.4s, v27.4s, v8.s[0] // .........................................................................................~....'..........................................................................................*....'............................................ 
+ // sqrdmulh v27.4s, v12.4s, v26.4s // ...........................................................................................~..'............................................................................................*..'............................................ + // mul v12.4s, v12.4s, v25.4s // ..........................~...................................................................'...........................~...................................................................'...........................l................ + // mls v12.4s, v27.4s, v8.s[0] // ................................~.............................................................'.................................~.............................................................'.................................l.......... + // str q9, [x0], #(16) // ....................................................................~.........................'.....................................................................*.........................'............................................ + // str q10, [x0, #(-16 + 1*(1024/8))] // .......................................................................................~......'........................................................................................*......'............................................ + // str q11, [x0, #(-16 + 2*(1024/8))] // .............................................................................................~'..............................................................................................*'............................................ + // str q12, [x0, #(-16 + 3*(1024/8))] // ....................................~.........................................................'.....................................~.........................................................'.....................................l...... 
+ + sub count, count, 1 cbnz count, layer123_start + // Instructions: 80 + // Expected cycles: 82 + // Expected IPC: 0.98 + // + // Cycle bound: 82.0 + // IPC bound: 0.98 + // + // Wall time: 14.94s + // User time: 14.94s + // + // ------------------------------- cycle (expected) --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + str q27, [x0, #752] // *................................................................................. + mul v23.4S, v30.4S, v1.S[0] // .*................................................................................ + sqrdmulh v14.4S, v30.4S, v1.S[1] // ..*............................................................................... + sub v11.4S, v24.4S, v5.4S // ...*.............................................................................. + add v5.4S, v24.4S, v5.4S // ....*............................................................................. + mul v24.4S, v6.4S, v3.S[0] // .....*............................................................................ + sqrdmulh v7.4S, v6.4S, v3.S[1] // ......*........................................................................... + add v29.4S, v20.4S, v31.4S // .......*.......................................................................... + mul v31.4S, v17.4S, v25.4S // ........*......................................................................... + sqrdmulh v15.4S, v19.4S, v0.S[1] // .........*........................................................................ + mul v6.4S, v19.4S, v0.S[0] // ..........*....................................................................... + sqrdmulh v27.4S, v28.4S, v2.S[3] // ...........*...................................................................... + mul v16.4S, v28.4S, v2.S[2] // ............*..................................................................... 
+ mls v31.4S, v13.4S, v8.S[0] // .............*.................................................................... + mls v23.4S, v14.4S, v8.S[0] // ..............*................................................................... + sqrdmulh v28.4S, v4.4S, v1.S[3] // ...............*.................................................................. + add v19.4S, v29.4S, v5.4S // ................*................................................................. + mul v4.4S, v11.4S, v2.S[0] // .................*................................................................ + sqrdmulh v21.4S, v11.4S, v2.S[1] // ..................*............................................................... + sub v12.4S, v19.4S, v9.4S // ...................*.............................................................. + add v10.4S, v19.4S, v9.4S // ....................*............................................................. + sub v20.4S, v29.4S, v5.4S // .....................*............................................................ + mls v18.4S, v28.4S, v8.S[0] // ......................*........................................................... + mls v24.4S, v7.4S, v8.S[0] // .......................*.......................................................... + mls v6.4S, v15.4S, v8.S[0] // ........................*......................................................... + mls v16.4S, v27.4S, v8.S[0] // .........................*........................................................ + mul v13.4S, v20.4S, v0.S[2] // ..........................*....................................................... + mls v4.4S, v21.4S, v8.S[0] // ...........................*...................................................... + sqrdmulh v5.4S, v12.4S, v0.S[1] // ............................*..................................................... + sub v22.4S, v16.4S, v24.4S // .............................*.................................................... 
+ str q31, [x0, #368] // ..............................*................................................... + sqrdmulh v7.4S, v20.4S, v0.S[3] // ...............................*.................................................. + sub v9.4S, v18.4S, v4.4S // ................................*................................................. + sqrdmulh v19.4S, v22.4S, v1.S[1] // .................................*................................................ + sqrdmulh v21.4S, v10.4S, v26.4S // ..................................*............................................... + mls v13.4S, v7.4S, v8.S[0] // ...................................*.............................................. + add v27.4S, v16.4S, v24.4S // ....................................*............................................. + mul v30.4S, v12.4S, v0.S[0] // .....................................*............................................ + mul v12.4S, v10.4S, v25.4S // ......................................*........................................... + sub v11.4S, v13.4S, v23.4S // .......................................*.......................................... + add v14.4S, v13.4S, v23.4S // ........................................*......................................... + mul v24.4S, v22.4S, v1.S[0] // .........................................*........................................ + mul v10.4S, v11.4S, v0.S[0] // ..........................................*....................................... + sqrdmulh v23.4S, v9.4S, v0.S[3] // ...........................................*...................................... + mul v31.4S, v14.4S, v25.4S // ............................................*..................................... + sqrdmulh v29.4S, v11.4S, v0.S[1] // .............................................*.................................... + add v20.4S, v18.4S, v4.4S // ..............................................*................................... 
+ mul v18.4S, v9.4S, v0.S[2] // ...............................................*.................................. + mls v30.4S, v5.4S, v8.S[0] // ................................................*................................. + mls v24.4S, v19.4S, v8.S[0] // .................................................*................................ + sub v9.4S, v20.4S, v27.4S // ..................................................*............................... + sqrdmulh v5.4S, v14.4S, v26.4S // ...................................................*.............................. + mls v18.4S, v23.4S, v8.S[0] // ....................................................*............................. + sqrdmulh v4.4S, v9.4S, v0.S[1] // .....................................................*............................ + mul v17.4S, v9.4S, v0.S[0] // ......................................................*........................... + str q6, [x0, #624] // .......................................................*.......................... + mls v31.4S, v5.4S, v8.S[0] // ........................................................*......................... + add v16.4S, v20.4S, v27.4S // .........................................................*........................ + mls v12.4S, v21.4S, v8.S[0] // ..........................................................*....................... + mls v17.4S, v4.4S, v8.S[0] // ...........................................................*...................... + mls v10.4S, v29.4S, v8.S[0] // ............................................................*..................... + mul v29.4S, v16.4S, v25.4S // .............................................................*.................... + str q12, [x0], #(16) // ..............................................................*................... + add v21.4S, v18.4S, v24.4S // ...............................................................*.................. 
+ str q10, [x0, #752] // ................................................................*................. + sqrdmulh v4.4S, v16.4S, v26.4S // .................................................................*................ + sqrdmulh v27.4S, v21.4S, v26.4S // ..................................................................*............... + str q31, [x0, #240] // ...................................................................*.............. + sub v12.4S, v18.4S, v24.4S // ....................................................................*............. + mul v16.4S, v21.4S, v25.4S // .....................................................................*............ + str q30, [x0, #496] // ......................................................................*........... + mul v23.4S, v12.4S, v0.S[0] // .......................................................................*.......... + sqrdmulh v28.4S, v12.4S, v0.S[1] // ........................................................................*......... + mls v29.4S, v4.4S, v8.S[0] // .........................................................................*........ + mls v16.4S, v27.4S, v8.S[0] // ..........................................................................*....... + str q17, [x0, #624] // ...........................................................................*...... + mls v23.4S, v28.4S, v8.S[0] // ............................................................................*..... + str q29, [x0, #112] // .............................................................................*.... + str q16, [x0, #368] // ...............................................................................*.. 
+ str q23, [x0, #880] // .................................................................................* + + // ------------------------------- cycle (expected) --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + // sub v10.4S, v24.4S, v5.4S // ...*.............................................................................. + // add v7.4S, v24.4S, v5.4S // ....*............................................................................. + // sqrdmulh v4.4S, v4.4S, v1.S[3] // ...............*.................................................................. + // add v21.4S, v20.4S, v31.4S // .......*.......................................................................... + // add v24.4S, v21.4S, v7.4S // ................*................................................................. + // sub v22.4S, v21.4S, v7.4S // .....................*............................................................ + // mul v15.4S, v10.4S, v2.S[0] // .................*................................................................ + // sqrdmulh v21.4S, v10.4S, v2.S[1] // ..................*............................................................... + // mls v18.4S, v4.4S, v8.S[0] // ......................*........................................................... + // sqrdmulh v20.4S, v28.4S, v2.S[3] // ...........*...................................................................... + // mul v16.4S, v28.4S, v2.S[2] // ............*..................................................................... + // mul v14.4S, v22.4S, v0.S[2] // ..........................*....................................................... + // mul v29.4S, v6.4S, v3.S[0] // .....*............................................................................ + // mul v11.4S, v19.4S, v0.S[0] // ..........*....................................................................... 
+ // sqrdmulh v23.4S, v6.4S, v3.S[1] // ......*........................................................................... + // mls v15.4S, v21.4S, v8.S[0] // ...........................*...................................................... + // sqrdmulh v10.4S, v19.4S, v0.S[1] // .........*........................................................................ + // mls v16.4S, v20.4S, v8.S[0] // .........................*........................................................ + // mls v29.4S, v23.4S, v8.S[0] // .......................*.......................................................... + // sub v21.4S, v18.4S, v15.4S // ................................*................................................. + // add v12.4S, v18.4S, v15.4S // ..............................................*................................... + // str q27, [x0, #752] // *................................................................................. + // mul v23.4S, v17.4S, v25.4S // ........*......................................................................... + // sub v5.4S, v16.4S, v29.4S // .............................*.................................................... + // mul v17.4S, v21.4S, v0.S[2] // ...............................................*.................................. + // add v18.4S, v16.4S, v29.4S // ....................................*............................................. + // sqrdmulh v16.4S, v30.4S, v1.S[1] // ..*............................................................................... + // sqrdmulh v15.4S, v5.4S, v1.S[1] // .................................*................................................ + // mls v23.4S, v13.4S, v8.S[0] // .............*.................................................................... + // mls v11.4S, v10.4S, v8.S[0] // ........................*......................................................... 
+ // sqrdmulh v13.4S, v21.4S, v0.S[3] // ...........................................*...................................... + // sub v21.4S, v24.4S, v9.4S // ...................*.............................................................. + // str q23, [x0, #368] // ..............................*................................................... + // mul v23.4S, v5.4S, v1.S[0] // .........................................*........................................ + // mls v17.4S, v13.4S, v8.S[0] // ....................................................*............................. + // sqrdmulh v5.4S, v21.4S, v0.S[1] // ............................*..................................................... + // mul v19.4S, v21.4S, v0.S[0] // .....................................*............................................ + // mls v23.4S, v15.4S, v8.S[0] // .................................................*................................ + // str q11, [x0, #624] // .......................................................*.......................... + // add v11.4S, v24.4S, v9.4S // ....................*............................................................. + // mls v19.4S, v5.4S, v8.S[0] // ................................................*................................. + // sub v13.4S, v17.4S, v23.4S // ....................................................................*............. + // add v17.4S, v17.4S, v23.4S // ...............................................................*.................. + // sqrdmulh v23.4S, v22.4S, v0.S[3] // ...............................*.................................................. + // add v9.4S, v12.4S, v18.4S // .........................................................*........................ + // str q19, [x0, #512] // ......................................................................*........... 
+ // sub v19.4S, v12.4S, v18.4S // ..................................................*............................... + // mul v18.4S, v13.4S, v0.S[0] // .......................................................................*.......... + // sqrdmulh v13.4S, v13.4S, v0.S[1] // ........................................................................*......... + // mls v18.4S, v13.4S, v8.S[0] // ............................................................................*..... + // sqrdmulh v13.4S, v11.4S, v26.4S // ..................................*............................................... + // mul v11.4S, v11.4S, v25.4S // ......................................*........................................... + // mul v22.4S, v9.4S, v25.4S // .............................................................*.................... + // str q18, [x0, #896] // .................................................................................* + // mls v11.4S, v13.4S, v8.S[0] // ..........................................................*....................... + // str q11, [x0], #(16) // ..............................................................*................... + // mul v11.4S, v30.4S, v1.S[0] // .*................................................................................ + // sqrdmulh v30.4S, v9.4S, v26.4S // .................................................................*................ + // mls v14.4S, v23.4S, v8.S[0] // ...................................*.............................................. + // mls v11.4S, v16.4S, v8.S[0] // ..............*................................................................... + // mls v22.4S, v30.4S, v8.S[0] // .........................................................................*........ + // add v30.4S, v14.4S, v11.4S // ........................................*......................................... 
+ // sub v18.4S, v14.4S, v11.4S // .......................................*.......................................... + // sqrdmulh v21.4S, v30.4S, v26.4S // ...................................................*.............................. + // mul v7.4S, v30.4S, v25.4S // ............................................*..................................... + // sqrdmulh v13.4S, v18.4S, v0.S[1] // .............................................*.................................... + // mul v27.4S, v18.4S, v0.S[0] // ..........................................*....................................... + // str q22, [x0, #112] // .............................................................................*.... + // mls v7.4S, v21.4S, v8.S[0] // ........................................................*......................... + // mls v27.4S, v13.4S, v8.S[0] // ............................................................*..................... + // sqrdmulh v13.4S, v17.4S, v26.4S // ..................................................................*............... + // str q7, [x0, #240] // ...................................................................*.............. + // mul v11.4S, v19.4S, v0.S[0] // ......................................................*........................... + // sqrdmulh v10.4S, v19.4S, v0.S[1] // .....................................................*............................ + // str q27, [x0, #752] // ................................................................*................. + // mul v23.4S, v17.4S, v25.4S // .....................................................................*............ + // mls v23.4S, v13.4S, v8.S[0] // ..........................................................................*....... + // mls v11.4S, v10.4S, v8.S[0] // ...........................................................*...................... 
+ // str q23, [x0, #368] // ...............................................................................*.. + // str q11, [x0, #624] // ...........................................................................*...... + pop_stack ret