Skip to content

Commit cd0f560

Browse files
authored
[AArch64][SME] Precommit tests for LUTI4 Chain issues (NFC) (llvm#161505)
These tests show that `luti4` intrinsics are currently incorrectly CSE'd (merged by common-subexpression elimination).
1 parent 58c959b commit cd0f560

File tree

3 files changed

+89
-0
lines changed

3 files changed

+89
-0
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc < %s | FileCheck %s
3+
4+
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
5+
target triple = "arm64-apple-macosx15.0.0"
6+
7+
; From: https://github.com/llvm/llvm-project/issues/161420. This test checks that
8+
; two `luti4` instructions are emitted. FIXME: This is currently broken!
9+
; @pluto: loads ZT0 from %arg1, issues a luti4 lookup on %load and feeds it to
; an fmla with slice operand 0; then reloads ZT0 from %arg2 and issues a luti4
; call with the same operands, feeding an fmla with slice operand 2.
; The autogenerated CHECK lines below currently contain only one `ldr zt0`
; (from x1), one `luti4`, and one `fmla`.
define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 {
10+
; CHECK-LABEL: pluto:
11+
; CHECK: ; %bb.0: ; %bb
12+
; CHECK-NEXT: mov w8, #0 ; =0x0
13+
; CHECK-NEXT: ldr zt0, [x1]
14+
; CHECK-NEXT: ldr z0, [x3]
15+
; CHECK-NEXT: ptrue pn8.h
16+
; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0]
17+
; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0]
18+
; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z4.h - z7.h }, { z0.h - z3.h }
19+
; CHECK-NEXT: ret
20+
bb:
21+
; First ZT0 load, from %arg1.
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1)
22+
%load = load <vscale x 16 x i8>, ptr %arg3, align 16
23+
%call = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16()
24+
%call4 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %call, ptr %arg)
25+
%extractvalue = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 0
26+
%extractvalue5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 1
27+
%extractvalue6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 2
28+
%extractvalue7 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 3
29+
; First luti4 call; its four results feed the first fmla below.
%call8 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, <vscale x 16 x i8> %load, i32 0)
30+
%extractvalue9 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 0
31+
%extractvalue10 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 1
32+
%extractvalue11 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 2
33+
%extractvalue12 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 3
34+
tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %extractvalue, <vscale x 8 x half> %extractvalue5, <vscale x 8 x half> %extractvalue6, <vscale x 8 x half> %extractvalue7, <vscale x 8 x half> %extractvalue9, <vscale x 8 x half> %extractvalue10, <vscale x 8 x half> %extractvalue11, <vscale x 8 x half> %extractvalue12)
35+
; Second ZT0 load, from %arg2, placed between the two luti4 calls.
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg2)
36+
; Same operands as %call8, but issued after the ZT0 reload above.
%call13 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, <vscale x 16 x i8> %load, i32 0)
37+
%extractvalue14 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 0
38+
%extractvalue15 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 1
39+
%extractvalue16 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 2
40+
%extractvalue17 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 3
41+
tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 2, <vscale x 8 x half> %extractvalue, <vscale x 8 x half> %extractvalue5, <vscale x 8 x half> %extractvalue6, <vscale x 8 x half> %extractvalue7, <vscale x 8 x half> %extractvalue14, <vscale x 8 x half> %extractvalue15, <vscale x 8 x half> %extractvalue16, <vscale x 8 x half> %extractvalue17)
42+
ret void
43+
}
44+
45+
declare void @llvm.aarch64.sme.ldr.zt(i32, ptr)
46+
declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16()
47+
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
48+
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 immarg, <vscale x 16 x i8>, i32 immarg)
49+
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
50+
51+
attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn uwtable(sync) "aarch64_inout_za" "aarch64_inout_zt0" "aarch64_pstate_sm_enabled" "target-cpu"="apple-m1" "target-features"="+fp-armv8,+lse,+neon,+sme,+sme-f16f16,+sme2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a" }

llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,24 @@ define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscal
4848
ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
4949
}
5050

51+
; Tests that multiple identical luti4 intrinsics, with ZT0 loads interspersed, are not CSE'd.
52+
; FIXME: This is currently broken!
53+
; Issues two luti4.lane.zt.x4 calls with the same operands (%x, lane index 1),
; each preceded by its own ZT0 load (%ptrA, then %ptrB). Both results are passed
; to llvm.fake.use.
; The CHECK lines below currently contain one `luti4`, one `fake_use`, and no
; `ldr zt0`.
define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, <vscale x 16 x i8> %x) {
54+
; CHECK-LABEL: test_multiple_luti4_zt_i8:
55+
; CHECK: // %bb.0:
56+
; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1]
57+
; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
58+
; CHECK-NEXT: ret
59+
; First ZT0 load, from %ptrA.
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrA)
60+
%res1 = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 1)
61+
; Second ZT0 load, from %ptrB, placed between the two luti4 calls.
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrB)
62+
; Same operands as %res1, but issued after the ZT0 reload above.
%res2 = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 1)
63+
64+
call void (...) @llvm.fake.use({<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res1)
65+
call void (...) @llvm.fake.use({<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res2)
66+
ret void
67+
}
68+
5169
declare {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, <vscale x 16 x i8>, i32)
5270
declare {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, <vscale x 16 x i8>, i32)
5371
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32, <vscale x 16 x i8>, i32)

llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,24 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16
1414
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
1515
}
1616

17+
; Tests that multiple identical luti4 intrinsics, with ZT0 loads interspersed, are not CSE'd.
18+
; FIXME: This is currently broken!
19+
; Issues two luti4.zt.x4 calls with the same operands (%v0, %v1), each preceded
; by its own ZT0 load (%ptrA, then %ptrB). Both results are passed to
; llvm.fake.use.
; The CHECK lines below currently contain one `luti4`, one `fake_use`, and no
; `ldr zt0`.
define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
20+
; CHECK-LABEL: test_multiple_luti4_zt_i8:
21+
; CHECK: // %bb.0:
22+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
23+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
24+
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
25+
; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
26+
; CHECK-NEXT: ret
27+
; First ZT0 load, from %ptrA.
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrA)
28+
%res1 = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
29+
; Second ZT0 load, from %ptrB, placed between the two luti4 calls.
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrB)
30+
; Same operands as %res1, but issued after the ZT0 reload above.
%res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
31+
32+
call void (...) @llvm.fake.use({ <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res1)
33+
call void (...) @llvm.fake.use({ <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2)
34+
ret void
35+
}
36+
1737
attributes #0 = { "target-features"="+sme2,+sme-lutv2"}

0 commit comments

Comments
 (0)