; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword
; CI-DAG: buffer_load_dwordx3
; GCN: s_waitcnt
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 456, i8 addrspace(3)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx2
; CI: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }