; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s

; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two adjacent i8 constant stores; the store to %out is marked align 2.
; The checks expect two separate byte stores.
; NOTE(review): the constant 456 exceeds the i8 range (456 mod 256 = 200);
; presumably the IR parser truncates it -- confirm this is intended.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out, align 2
  ret void
}

; Same pair of i8 stores with no explicit alignment; two byte stores are
; expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out
  ret void
}

; Two adjacent i16 constant stores with the base store marked align 4; a
; dword store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; Same shape as the previous test, with both constants equal to zero.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; Adjacent i16 stores without explicit alignment; two short stores are
; expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Two adjacent i32 constant stores; one dwordx2 store is expected with
; 456 (0x1c8) in the low register and 123 (0x7b) in the high register.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; An i32 constant at the base and a float constant in the next dword; one
; dwordx2 store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; A float constant at the base and an i32 constant in the next dword; one
; dwordx2 store is expected with 4.0 in the low register and 123 (0x7b) in
; the high register.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four adjacent i32 constants, with the base element stored last; one
; dwordx4 store is expected whose register range starts at 1234 (0x4d2) and
; ends at 333 (0x14d).
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four adjacent float constants stored in ascending address order; one
; dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; Four adjacent float constants where the base element is written last; one
; dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Alternating i32 and float constants in four adjacent dwords. The expected
; stores differ between the RUN lines with and without
; -combiner-alias-analysis.
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-NOAA: buffer_store_dwordx4 v

; GCN-AA: buffer_store_dwordx2
; GCN-AA: buffer_store_dword v
; GCN-AA: buffer_store_dword v

; GCN: s_endpgm
define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three adjacent i32 constants; one dwordx2 store plus one dword store are
; expected, and no further dword store before the end of the program.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two adjacent i64 constants; one dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four adjacent i64 constants; two dwordx4 stores are expected.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Two adjacent i32 loads stored to adjacent addresses in the same order; one
; dwordx2 load feeding one dwordx2 store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Same as above, but both pairs start at element index 2, so the expected
; load and store each carry an immediate offset of 8 bytes.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; The two loaded values are stored in swapped order; separate dword loads and
; dword stores are expected.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; Four adjacent i32 loads stored in the same order; one dwordx4 load feeding
; one dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Three adjacent i32 loads and stores; a dwordx2 plus a dword access are
; expected on both the load side and the store side.
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; Float variant of the four-element adjacent load/store test.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; Loads start at element index 11 (byte offset 44) and stores at element
; index 7 (byte offset 28); the merged accesses are expected to carry those
; offsets.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; The four stores are issued in descending address order after a barrier;
; one dwordx4 load, the barrier, and one dwordx4 store are expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; The loaded values are stored in reversed element order; four dword loads,
; the barrier, and four dword stores are expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: s_barrier
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; Four adjacent i8 loads and stores where the first load and first store are
; marked align 4; one dword load feeding one dword store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Same i8 accesses without explicit alignment; four byte loads and four byte
; stores are expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Elements extracted from one <4 x i32> load are stored to four adjacent
; addresses. The expected stores differ between the RUN lines with and
; without -combiner-alias-analysis.
; This works once AA is enabled on the subtarget
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]

; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx4 [[LOAD]]

; GCN: s_endpgm
define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; addrspace(3) variant of the two-constant i8 test; two ds_write_b8
; instructions are expected.
; NOTE(review): the constant 456 exceeds the i8 range (456 mod 256 = 200);
; presumably the IR parser truncates it -- confirm this is intended.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 456, i8 addrspace(3)* %out, align 2
  ret void
}

; Two adjacent i32 constants in addrspace(3); one ds_write2_b32 with
; offset1:1 is expected, writing 456 (0x1c8) then 123 (0x7b).
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; Four adjacent i32 constants in addrspace(3); two ds_write2_b32
; instructions are expected, one for elements 2 and 3 and one for elements
; 0 and 1.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five adjacent i32 constants; one dwordx4 store for the first four elements
; and one dword store for the fifth are expected.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six adjacent i32 constants; a dwordx4 store followed by a dwordx2 store is
; expected.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven adjacent i32 constants; dwordx4, dwordx2 and dword stores are
; expected, in that order.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; Eight adjacent i32 constants; two dwordx4 stores are expected.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; Copies a <3 x i32> loaded with align 4. The checks expect a dwordx2 access
; at offset 0 plus a dword access at offset 8 for both the load and the
; store, no offen addressing, and a reported scratch size of 0.
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8

; GCN: ScratchSize: 0{{$}}
define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}

; Copies a <3 x i64> loaded with align 4. The checks expect a dwordx4 access
; at offset 0 plus a dwordx2 access at offset 16 for both the load and the
; store, and a reported scratch size of 0.
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}

; Loads a <3 x float> with align 4, adds a constant vector, and stores the
; result. The same load/store shapes as the <3 x i32> case are expected.
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}

; Loads a <3 x double> with align 4, adds a constant vector, and stores the
; result. The same load/store shapes as the <3 x i64> case are expected.
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}

; Barrier intrinsic called by the "inverse" and "shuffle" four-element tests.
declare void @llvm.amdgcn.s.barrier() #1

; Attribute group #0 is attached to most test functions; #1 is attached to
; the barrier intrinsic and its call sites.
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }