1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
| ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}s_buffer_load_imm:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
%bitcast = bitcast i32 %load to float
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_load_index:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast i32 %load to float
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_loadx2_imm:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
%bitcast = bitcast <2 x i32> %load to <2 x float>
%x = extractelement <2 x float> %bitcast, i32 0
%y = extractelement <2 x float> %bitcast, i32 1
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_loadx2_index:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <2 x i32> %load to <2 x float>
%x = extractelement <2 x float> %bitcast, i32 0
%y = extractelement <2 x float> %bitcast, i32 1
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_loadx4_imm:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
%bitcast = bitcast <4 x i32> %load to <4 x float>
%x = extractelement <4 x float> %bitcast, i32 0
%y = extractelement <4 x float> %bitcast, i32 1
%z = extractelement <4 x float> %bitcast, i32 2
%w = extractelement <4 x float> %bitcast, i32 3
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_loadx4_index:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <4 x i32> %load to <4 x float>
%x = extractelement <4 x float> %bitcast, i32 0
%y = extractelement <4 x float> %bitcast, i32 1
%z = extractelement <4 x float> %bitcast, i32 2
%w = extractelement <4 x float> %bitcast, i32 3
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
main_body:
%load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
%load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
%x = bitcast i32 %load0 to float
%y = bitcast i32 %load1 to float
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4:
;CHECK-NOT: s_waitcnt;
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
main_body:
%load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
%load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
%load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0)
%load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0)
%x = bitcast i32 %load0 to float
%y = bitcast i32 %load1 to float
%z = bitcast i32 %load2 to float
%w = bitcast i32 %load3 to float
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb:
;CHECK-NOT: s_waitcnt;
;CHECK: v_or_b32
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
main_body:
%tmp = shl i32 %index, 4
br label %bb1
bb1: ; preds = %main_body
%tmp1 = or i32 %tmp, 8
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
%bitcast = bitcast i32 %load to float
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
ret void
}
;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
;CHECK-NOT: s_waitcnt;
;CHECK: v_or_b32
;CHECK: v_or_b32
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) {
main_body:
%tmp = shl i32 %index, 4
br label %bb1
bb1: ; preds = %main_body
%tmp1 = or i32 %tmp, 8
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
%tmp2 = or i32 %tmp1, 4
%load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp2, i32 0)
%bitcast = bitcast i32 %load to float
%bitcast2 = bitcast i32 %load2 to float
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 true)
ret void
}
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
|