Commit 1411d39

PR 117048: aarch64: Add define_insn_and_split for vector ROTATE
The ultimate goal in this PR is to match the XAR pattern that is
represented as a (ROTATE (XOR X Y) VCST) from the ACLE intrinsics code in
the testcase.
The first blocker for this was the missing recognition of ROTATE in
simplify-rtx, which is fixed in the previous patch.

The next problem is that once the ROTATE has been matched from the shifts
and orr/xor/plus, it will try to match it in an insn before trying to
combine the XOR into it.  But as we don't have a backend pattern for a
vector ROTATE this recog fails and combine does not try the followup
XOR+ROTATE combination which would have succeeded.

This patch solves that by introducing a sort of "scaffolding" pattern for
vector ROTATE, which allows it to be combined into the XAR.
If it fails to be combined into anything the splitter will break it back
down into the SHL+USRA sequence that it would have emitted.
By having this splitter we can special-case some rotate amounts in the
future to emit more specialised instructions e.g. from the REV* family.
This can be done if the ROTATE is not combined into something else.
This optimisation is done in the next patch in the series.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <[email protected]>

gcc/

	PR target/117048
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	New define_insn_and_split.

gcc/testsuite/

	PR target/117048
	* gcc.target/aarch64/simd/pr117048.c: New test.
1 parent 1e5ff11 commit 1411d39
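
For illustration only (not part of the commit), a minimal C sketch of the fallback path the message describes: a plain vector rotate with no XOR feeding it gives combine nothing to fuse into an XAR, so the scaffolding pattern is expected to be split back into the SHL + USRA sequence. The function name rotl1_u64x2 is hypothetical.

#include <arm_neon.h>

/* Illustrative sketch: rotate each 64-bit lane left by 1 using ACLE
   intrinsics.  Combine can form a vector ROTATE from the shift/orr
   sequence; with no XOR to fold into an XAR, the new scaffolding
   pattern would later be split back into SHL + USRA.  */
uint64x2_t
rotl1_u64x2 (uint64x2_t x)
{
  return vorrq_u64 (vshlq_n_u64 (x, 1), vshrq_n_u64 (x, 63));
}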

File tree: 2 files changed (+102, -0 lines)


gcc/config/aarch64/aarch64-simd.md

Lines changed: 29 additions & 0 deletions
@@ -1294,6 +1294,35 @@
   [(set_attr "type" "neon_shift_acc<q>")]
 )
 
+;; After all the combinations and propagations of ROTATE have been
+;; attempted split any remaining vector rotates into SHL + USRA sequences.
+(define_insn_and_split "*aarch64_simd_rotate_imm<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand" "=&w")
+        (rotate:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
+                      (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))]
+  "TARGET_SIMD"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+        (ashift:VDQ_I (match_dup 1)
+                      (match_dup 2)))
+   (set (match_dup 0)
+        (plus:VDQ_I
+          (lshiftrt:VDQ_I
+            (match_dup 1)
+            (match_dup 4))
+          (match_dup 3)))]
+  {
+    operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
+    rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
+    int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
+    operands[4]
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+                                           bitwidth - INTVAL (shft_amnt));
+  }
+  [(set_attr "length" "8")]
+)
+
 (define_insn "aarch64_<sra_op>rsra_n<mode>_insn"
   [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w")
        (plus:VSDQ_I_DI
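
As a worked sketch of what the split computes (an illustration, not text from the commit): for 64-bit elements and a rotate amount of 1, operands[4] becomes 64 - 1 = 63, so the two emitted instructions perform tmp = x << 1 followed by dest = (x >> 63) + tmp. Because the shifted-left and shifted-right halves never overlap, the add is equivalent to an inclusive OR, which is why USRA (shift right and accumulate) can complete the rotate. The helper name below is hypothetical.

/* Per-lane arithmetic performed by the split RTL for a rotate by IMM
   on 64-bit elements (valid for 0 < IMM < 64); names are illustrative.  */
static inline unsigned long long
rotate_via_shl_usra (unsigned long long x, unsigned imm)
{
  unsigned long long tmp = x << imm;   /* SHL into the scratch register.  */
  return (x >> (64 - imm)) + tmp;      /* USRA: shift right and accumulate.  */
}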
gcc/testsuite/gcc.target/aarch64/simd/pr117048.c

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+#pragma GCC target "+sha3"
+
+/*
+** func_shl_eor:
+**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**	ret
+*/
+uint64x2_t
+func_shl_eor (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return veorq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_add_eor:
+**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**	ret
+*/
+uint64x2_t
+func_add_eor (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return veorq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_shl_orr:
+**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**	ret
+*/
+uint64x2_t
+func_shl_orr (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vorrq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_add_orr:
+**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**	ret
+*/
+uint64x2_t
+func_add_orr (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vorrq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_shl_add:
+**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**	ret
+*/
+uint64x2_t
+func_shl_add (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vaddq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_add_add:
+**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**	ret
+*/
+uint64x2_t
+func_add_add (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vaddq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
+}
