1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
| ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s
; Test for a conv2d like sequence of loads.
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%idx = zext i32 %id to i64
%gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx
%ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1
%load0 = load i64, i64 addrspace(1)* %ptr0
%ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2
%load1 = load i64, i64 addrspace(1)* %ptr1
%ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3
%load2 = load i64, i64 addrspace(1)* %ptr2
%ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4
%load3 = load i64, i64 addrspace(1)* %ptr3
%ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4
%load4 = load i64, i64 addrspace(1)* %ptr4
%ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3
%load5 = load i64, i64 addrspace(1)* %ptr5
%ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2
%load6 = load i64, i64 addrspace(1)* %ptr6
%ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1
%load7 = load i64, i64 addrspace(1)* %ptr7
%ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0
%load8 = load i64, i64 addrspace(1)* %ptr8
%add0 = add i64 %load1, %load0
%add1 = add i64 %load3, %load2
%add2 = add i64 %load5, %load4
%add3 = add i64 %load7, %load6
%add4 = add i64 %add0, %load8
%add5 = add i64 %add2, %add1
%add6 = add i64 %add4, %add3
%add7 = add i64 %add6, %add5
%gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx
%ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1
store volatile i64 %add7, i64 addrspace(1)* %ptr9
; Test various offset boundaries.
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
%gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
%load11 = load i64, i64 addrspace(1)* %gep11
%gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
%load12 = load i64, i64 addrspace(1)* %gep12
%gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
%load13 = load i64, i64 addrspace(1)* %gep13
%add11 = add i64 %load11, %load12
%add12 = add i64 %add11, %load13
store volatile i64 %add12, i64 addrspace(1)* undef
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
%gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024
%load21 = load i64, i64 addrspace(1)* %gep21
%gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048
%load22 = load i64, i64 addrspace(1)* %gep22
%gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512
%load23 = load i64, i64 addrspace(1)* %gep23
%add21 = add i64 %load22, %load21
%add22 = add i64 %add21, %load23
store volatile i64 %add22, i64 addrspace(1)* undef
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
%gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257
%load31 = load i64, i64 addrspace(1)* %gep31
%gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256
%load32 = load i64, i64 addrspace(1)* %gep32
%gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
%load33 = load i64, i64 addrspace(1)* %gep33
%add34 = add i64 %load32, %load31
%add35 = add i64 %add34, %load33
store volatile i64 %add35, i64 addrspace(1)* undef
ret void
}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
; GFX9-NEXT: s_waitcnt
; NGFX9-NOT: global_load_dword
define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
bb:
%tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
%tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { convergent nounwind }
attributes #1 = { nounwind readnone speculatable }
|