1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s 3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s 4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s 5 6@lds = addrspace(3) global [512 x float] undef, align 4 7@lds.f64 = addrspace(3) global [512 x double] undef, align 8 8 9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 10; CI-LABEL: simple_write2_one_val_f32: 11; CI: ; %bb.0: 12; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 13; CI-NEXT: s_mov_b32 s3, 0xf000 14; CI-NEXT: s_mov_b32 s2, 0 15; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 16; CI-NEXT: v_mov_b32_e32 v1, 0 17; CI-NEXT: s_waitcnt lgkmcnt(0) 18; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 19; CI-NEXT: s_mov_b32 m0, -1 20; CI-NEXT: s_waitcnt vmcnt(0) 21; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 22; CI-NEXT: s_endpgm 23; 24; GFX9-LABEL: simple_write2_one_val_f32: 25; GFX9: ; %bb.0: 26; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 27; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 28; GFX9-NEXT: s_waitcnt lgkmcnt(0) 29; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 30; GFX9-NEXT: s_waitcnt vmcnt(0) 31; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 32; GFX9-NEXT: s_endpgm 33 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 34 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i 35 %val = load float, float addrspace(1)* %in.gep, align 4 36 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 37 store float %val, 
float addrspace(3)* %arrayidx0, align 4 38 %add.x = add nsw i32 %x.i, 8 39 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 40 store float %val, float addrspace(3)* %arrayidx1, align 4 41 ret void 42} 43 44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 45; CI-LABEL: simple_write2_two_val_f32: 46; CI: ; %bb.0: 47; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 48; CI-NEXT: s_mov_b32 s3, 0xf000 49; CI-NEXT: s_mov_b32 s2, 0 50; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 51; CI-NEXT: v_mov_b32_e32 v1, 0 52; CI-NEXT: s_waitcnt lgkmcnt(0) 53; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 54; CI-NEXT: s_waitcnt vmcnt(0) 55; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc 56; CI-NEXT: s_waitcnt vmcnt(0) 57; CI-NEXT: s_mov_b32 m0, -1 58; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 59; CI-NEXT: s_endpgm 60; 61; GFX9-LABEL: simple_write2_two_val_f32: 62; GFX9: ; %bb.0: 63; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 64; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 65; GFX9-NEXT: s_waitcnt lgkmcnt(0) 66; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 67; GFX9-NEXT: s_waitcnt vmcnt(0) 68; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc 69; GFX9-NEXT: s_waitcnt vmcnt(0) 70; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 71; GFX9-NEXT: s_endpgm 72 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 73 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i 74 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 75 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4 76 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4 77 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 78 store float %val0, float addrspace(3)* %arrayidx0, align 4 79 %add.x = add nsw i32 %x.i, 8 80 %arrayidx1 = getelementptr inbounds [512 x float], [512 x 
float] addrspace(3)* @lds, i32 0, i32 %add.x 81 store float %val1, float addrspace(3)* %arrayidx1, align 4 82 ret void 83} 84 85define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 86; CI-LABEL: simple_write2_two_val_f32_volatile_0: 87; CI: ; %bb.0: 88; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 89; CI-NEXT: s_mov_b32 s3, 0xf000 90; CI-NEXT: s_mov_b32 s2, 0 91; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 92; CI-NEXT: v_mov_b32_e32 v1, 0 93; CI-NEXT: s_waitcnt lgkmcnt(0) 94; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 95; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 96; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 97; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 98; CI-NEXT: s_waitcnt vmcnt(0) 99; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc 100; CI-NEXT: s_waitcnt vmcnt(0) 101; CI-NEXT: s_mov_b32 m0, -1 102; CI-NEXT: ds_write_b32 v0, v2 103; CI-NEXT: ds_write_b32 v0, v1 offset:32 104; CI-NEXT: s_endpgm 105; 106; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: 107; GFX9: ; %bb.0: 108; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 109; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 111; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 112; GFX9-NEXT: s_waitcnt vmcnt(0) 113; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 114; GFX9-NEXT: s_waitcnt vmcnt(0) 115; GFX9-NEXT: ds_write_b32 v0, v1 116; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 117; GFX9-NEXT: s_endpgm 118 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 119 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 120 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 121 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4 122 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4 123 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 124 store volatile float %val0, float addrspace(3)* 
%arrayidx0, align 4 125 %add.x = add nsw i32 %x.i, 8 126 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 127 store float %val1, float addrspace(3)* %arrayidx1, align 4 128 ret void 129} 130 131define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 132; CI-LABEL: simple_write2_two_val_f32_volatile_1: 133; CI: ; %bb.0: 134; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 135; CI-NEXT: s_mov_b32 s3, 0xf000 136; CI-NEXT: s_mov_b32 s2, 0 137; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 138; CI-NEXT: v_mov_b32_e32 v1, 0 139; CI-NEXT: s_waitcnt lgkmcnt(0) 140; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 141; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 142; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 143; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 144; CI-NEXT: s_waitcnt vmcnt(0) 145; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc 146; CI-NEXT: s_waitcnt vmcnt(0) 147; CI-NEXT: s_mov_b32 m0, -1 148; CI-NEXT: ds_write_b32 v0, v2 149; CI-NEXT: ds_write_b32 v0, v1 offset:32 150; CI-NEXT: s_endpgm 151; 152; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 155; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 156; GFX9-NEXT: s_waitcnt lgkmcnt(0) 157; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 158; GFX9-NEXT: s_waitcnt vmcnt(0) 159; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 160; GFX9-NEXT: s_waitcnt vmcnt(0) 161; GFX9-NEXT: ds_write_b32 v0, v1 162; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 163; GFX9-NEXT: s_endpgm 164 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 165 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 166 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 167 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4 168 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4 169 %arrayidx0 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 170 store float %val0, float addrspace(3)* %arrayidx0, align 4 171 %add.x = add nsw i32 %x.i, 8 172 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 173 store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 174 ret void 175} 176 177; 2 data subregisters from different super registers. 178; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo 179; This should be an s_mov_b32. The v_mov_b32 gets introduced by an 180; early legalization of the constant bus constraint on the v_lshl_add_u32, 181; and then SIFoldOperands folds in an unlucky order. 182define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { 183; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: 184; CI: ; %bb.0: 185; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 186; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 187; CI-NEXT: s_mov_b32 s3, 0xf000 188; CI-NEXT: s_mov_b32 s2, 0 189; CI-NEXT: v_mov_b32_e32 v2, 0 190; CI-NEXT: s_waitcnt lgkmcnt(0) 191; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc 192; CI-NEXT: s_waitcnt vmcnt(0) 193; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc 194; CI-NEXT: s_waitcnt vmcnt(0) 195; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 196; CI-NEXT: s_mov_b32 m0, -1 197; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8 198; CI-NEXT: s_endpgm 199; 200; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: 201; GFX9: ; %bb.0: 202; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 203; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 204; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 205; GFX9-NEXT: ; kill: killed $vgpr4 206; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 207; GFX9-NEXT: s_waitcnt lgkmcnt(0) 208; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc 209; GFX9-NEXT: s_waitcnt vmcnt(0) 210; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc 211; 
GFX9-NEXT: s_waitcnt vmcnt(0) 212; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 213; GFX9-NEXT: s_endpgm 214 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 215 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i 216 %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 217 %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 218 %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 219 %val0.0 = extractelement <2 x float> %val0, i32 0 220 %val1.1 = extractelement <2 x float> %val1, i32 1 221 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 222 store float %val0.0, float addrspace(3)* %arrayidx0, align 4 223 %add.x = add nsw i32 %x.i, 8 224 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 225 store float %val1.1, float addrspace(3)* %arrayidx1, align 4 226 ret void 227} 228 229define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { 230; CI-LABEL: simple_write2_two_val_subreg2_f32: 231; CI: ; %bb.0: 232; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 233; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 234; CI-NEXT: s_mov_b32 s3, 0xf000 235; CI-NEXT: s_mov_b32 s2, 0 236; CI-NEXT: v_mov_b32_e32 v2, 0 237; CI-NEXT: s_waitcnt lgkmcnt(0) 238; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 239; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 240; CI-NEXT: s_mov_b32 m0, -1 241; CI-NEXT: s_waitcnt vmcnt(0) 242; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 243; CI-NEXT: s_endpgm 244; 245; GFX9-LABEL: simple_write2_two_val_subreg2_f32: 246; GFX9: ; %bb.0: 247; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 248; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 249; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 251; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] 252; GFX9-NEXT: 
s_waitcnt vmcnt(0) 253; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 254; GFX9-NEXT: s_endpgm 255 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 256 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i 257 %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 258 %val0 = extractelement <2 x float> %val, i32 0 259 %val1 = extractelement <2 x float> %val, i32 1 260 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 261 store float %val0, float addrspace(3)* %arrayidx0, align 4 262 %add.x = add nsw i32 %x.i, 8 263 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 264 store float %val1, float addrspace(3)* %arrayidx1, align 4 265 ret void 266} 267 268define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { 269; CI-LABEL: simple_write2_two_val_subreg4_f32: 270; CI: ; %bb.0: 271; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 272; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 273; CI-NEXT: s_mov_b32 s3, 0xf000 274; CI-NEXT: s_mov_b32 s2, 0 275; CI-NEXT: v_mov_b32_e32 v2, 0 276; CI-NEXT: s_waitcnt lgkmcnt(0) 277; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64 278; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 279; CI-NEXT: s_mov_b32 m0, -1 280; CI-NEXT: s_waitcnt vmcnt(0) 281; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 282; CI-NEXT: s_endpgm 283; 284; GFX9-LABEL: simple_write2_two_val_subreg4_f32: 285; GFX9: ; %bb.0: 286; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 287; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 288; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 289; GFX9-NEXT: s_waitcnt lgkmcnt(0) 290; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1] 291; GFX9-NEXT: s_waitcnt vmcnt(0) 292; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 293; GFX9-NEXT: s_endpgm 294 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 295 %in.gep = getelementptr <4 x float>, <4 x float> 
addrspace(1)* %in, i32 %x.i 296 %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 297 %val0 = extractelement <4 x float> %val, i32 0 298 %val1 = extractelement <4 x float> %val, i32 3 299 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 300 store float %val0, float addrspace(3)* %arrayidx0, align 4 301 %add.x = add nsw i32 %x.i, 8 302 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 303 store float %val1, float addrspace(3)* %arrayidx1, align 4 304 ret void 305} 306 307define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 308; CI-LABEL: simple_write2_two_val_max_offset_f32: 309; CI: ; %bb.0: 310; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 311; CI-NEXT: s_mov_b32 s3, 0xf000 312; CI-NEXT: s_mov_b32 s2, 0 313; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 314; CI-NEXT: v_mov_b32_e32 v1, 0 315; CI-NEXT: s_waitcnt lgkmcnt(0) 316; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 317; CI-NEXT: s_waitcnt vmcnt(0) 318; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc 319; CI-NEXT: s_waitcnt vmcnt(0) 320; CI-NEXT: s_mov_b32 m0, -1 321; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255 322; CI-NEXT: s_endpgm 323; 324; GFX9-LABEL: simple_write2_two_val_max_offset_f32: 325; GFX9: ; %bb.0: 326; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 327; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 328; GFX9-NEXT: s_waitcnt lgkmcnt(0) 329; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 330; GFX9-NEXT: s_waitcnt vmcnt(0) 331; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc 332; GFX9-NEXT: s_waitcnt vmcnt(0) 333; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255 334; GFX9-NEXT: s_endpgm 335 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 336 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i 337 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 
1 338 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4 339 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4 340 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 341 store float %val0, float addrspace(3)* %arrayidx0, align 4 342 %add.x = add nsw i32 %x.i, 255 343 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 344 store float %val1, float addrspace(3)* %arrayidx1, align 4 345 ret void 346} 347 348define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 349; CI-LABEL: simple_write2_two_val_too_far_f32: 350; CI: ; %bb.0: 351; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 352; CI-NEXT: s_mov_b32 s3, 0xf000 353; CI-NEXT: s_mov_b32 s2, 0 354; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 355; CI-NEXT: v_mov_b32_e32 v1, 0 356; CI-NEXT: s_waitcnt lgkmcnt(0) 357; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 358; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 359; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 360; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 361; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 362; CI-NEXT: s_mov_b32 m0, -1 363; CI-NEXT: s_waitcnt vmcnt(1) 364; CI-NEXT: ds_write_b32 v0, v2 365; CI-NEXT: s_waitcnt vmcnt(0) 366; CI-NEXT: ds_write_b32 v0, v1 offset:1028 367; CI-NEXT: s_endpgm 368; 369; GFX9-LABEL: simple_write2_two_val_too_far_f32: 370; GFX9: ; %bb.0: 371; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 372; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 373; GFX9-NEXT: s_waitcnt lgkmcnt(0) 374; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 375; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 376; GFX9-NEXT: s_waitcnt vmcnt(1) 377; GFX9-NEXT: ds_write_b32 v0, v1 378; GFX9-NEXT: s_waitcnt vmcnt(0) 379; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028 380; GFX9-NEXT: s_endpgm 381 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 382 %in0.gep = getelementptr float, 
float addrspace(1)* %in0, i32 %x.i 383 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 384 %val0 = load float, float addrspace(1)* %in0.gep, align 4 385 %val1 = load float, float addrspace(1)* %in1.gep, align 4 386 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 387 store float %val0, float addrspace(3)* %arrayidx0, align 4 388 %add.x = add nsw i32 %x.i, 257 389 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 390 store float %val1, float addrspace(3)* %arrayidx1, align 4 391 ret void 392} 393 394define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 395; CI-LABEL: simple_write2_two_val_f32_x2: 396; CI: ; %bb.0: 397; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 398; CI-NEXT: s_mov_b32 s3, 0xf000 399; CI-NEXT: s_mov_b32 s2, 0 400; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 401; CI-NEXT: v_mov_b32_e32 v1, 0 402; CI-NEXT: s_waitcnt lgkmcnt(0) 403; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 404; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 405; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 406; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 407; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 408; CI-NEXT: s_mov_b32 m0, -1 409; CI-NEXT: s_waitcnt vmcnt(0) 410; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 411; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27 412; CI-NEXT: s_endpgm 413; 414; GFX9-LABEL: simple_write2_two_val_f32_x2: 415; GFX9: ; %bb.0: 416; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 417; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 419; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 420; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 421; GFX9-NEXT: s_waitcnt vmcnt(0) 422; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 423; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 424; GFX9-NEXT: s_endpgm 425 %tid.x = tail call i32 
@llvm.amdgcn.workitem.id.x() #1 426 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x 427 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x 428 %val0 = load float, float addrspace(1)* %in0.gep, align 4 429 %val1 = load float, float addrspace(1)* %in1.gep, align 4 430 431 %idx.0 = add nsw i32 %tid.x, 0 432 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 433 store float %val0, float addrspace(3)* %arrayidx0, align 4 434 435 %idx.1 = add nsw i32 %tid.x, 8 436 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 437 store float %val1, float addrspace(3)* %arrayidx1, align 4 438 439 %idx.2 = add nsw i32 %tid.x, 11 440 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 441 store float %val0, float addrspace(3)* %arrayidx2, align 4 442 443 %idx.3 = add nsw i32 %tid.x, 27 444 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 445 store float %val1, float addrspace(3)* %arrayidx3, align 4 446 447 ret void 448} 449 450define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 451; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: 452; CI: ; %bb.0: 453; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 454; CI-NEXT: s_mov_b32 s3, 0xf000 455; CI-NEXT: s_mov_b32 s2, 0 456; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 457; CI-NEXT: v_mov_b32_e32 v1, 0 458; CI-NEXT: s_waitcnt lgkmcnt(0) 459; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 460; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 461; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 462; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 463; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 464; CI-NEXT: s_mov_b32 m0, -1 465; CI-NEXT: s_waitcnt vmcnt(0) 466; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8 467; CI-NEXT: 
ds_write2_b32 v0, v2, v1 offset0:11 offset1:27 468; CI-NEXT: s_endpgm 469; 470; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: 471; GFX9: ; %bb.0: 472; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 473; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 474; GFX9-NEXT: s_waitcnt lgkmcnt(0) 475; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 476; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 477; GFX9-NEXT: s_waitcnt vmcnt(0) 478; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8 479; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 480; GFX9-NEXT: s_endpgm 481 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 482 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x 483 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x 484 %val0 = load float, float addrspace(1)* %in0.gep, align 4 485 %val1 = load float, float addrspace(1)* %in1.gep, align 4 486 487 %idx.0 = add nsw i32 %tid.x, 3 488 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 489 store float %val0, float addrspace(3)* %arrayidx0, align 4 490 491 %idx.1 = add nsw i32 %tid.x, 8 492 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 493 store float %val1, float addrspace(3)* %arrayidx1, align 4 494 495 %idx.2 = add nsw i32 %tid.x, 11 496 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 497 store float %val0, float addrspace(3)* %arrayidx2, align 4 498 499 %idx.3 = add nsw i32 %tid.x, 27 500 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 501 store float %val1, float addrspace(3)* %arrayidx3, align 4 502 503 ret void 504} 505 506define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { 507; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: 508; 
CI: ; %bb.0: 509; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 510; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf 511; CI-NEXT: s_mov_b32 s3, 0xf000 512; CI-NEXT: s_mov_b32 s2, 0 513; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 514; CI-NEXT: s_waitcnt lgkmcnt(0) 515; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 516; CI-NEXT: v_mov_b32_e32 v1, 0 517; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 518; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 519; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 520; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 521; CI-NEXT: v_mov_b32_e32 v1, s8 522; CI-NEXT: s_mov_b32 m0, -1 523; CI-NEXT: v_mov_b32_e32 v3, s9 524; CI-NEXT: s_waitcnt vmcnt(1) 525; CI-NEXT: ds_write_b32 v1, v2 offset:32 526; CI-NEXT: s_waitcnt vmcnt(0) 527; CI-NEXT: ds_write_b32 v3, v0 offset:32 528; CI-NEXT: s_endpgm 529; 530; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: 531; GFX9: ; %bb.0: 532; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 533; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c 534; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 535; GFX9-NEXT: s_waitcnt lgkmcnt(0) 536; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 537; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 538; GFX9-NEXT: v_mov_b32_e32 v0, s2 539; GFX9-NEXT: v_mov_b32_e32 v3, s3 540; GFX9-NEXT: s_waitcnt vmcnt(1) 541; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 542; GFX9-NEXT: s_waitcnt vmcnt(0) 543; GFX9-NEXT: ds_write_b32 v3, v2 offset:32 544; GFX9-NEXT: s_endpgm 545 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 546 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 547 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 548 %val0 = load float, float addrspace(1)* %in0.gep, align 4 549 %val1 = load float, float addrspace(1)* %in1.gep, align 4 550 551 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 552 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 553 %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 554 %gep.0 = extractelement 
<2 x float addrspace(3)*> %gep, i32 0 555 %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 556 557 ; Apply an additional offset after the vector that will be more obviously folded. 558 %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 559 store float %val0, float addrspace(3)* %gep.0, align 4 560 561 %add.x = add nsw i32 %x.i, 8 562 store float %val1, float addrspace(3)* %gep.1.offset, align 4 563 ret void 564} 565 566define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { 567; CI-LABEL: simple_write2_one_val_f64: 568; CI: ; %bb.0: 569; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 570; CI-NEXT: s_mov_b32 s3, 0xf000 571; CI-NEXT: s_mov_b32 s2, 0 572; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 573; CI-NEXT: v_mov_b32_e32 v1, 0 574; CI-NEXT: s_waitcnt lgkmcnt(0) 575; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 576; CI-NEXT: s_mov_b32 m0, -1 577; CI-NEXT: s_waitcnt vmcnt(0) 578; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8 579; CI-NEXT: s_endpgm 580; 581; GFX9-LABEL: simple_write2_one_val_f64: 582; GFX9: ; %bb.0: 583; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 584; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 585; GFX9-NEXT: s_waitcnt lgkmcnt(0) 586; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 587; GFX9-NEXT: s_waitcnt vmcnt(0) 588; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8 589; GFX9-NEXT: s_endpgm 590 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 591 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 592 %val = load double, double addrspace(1)* %in.gep, align 8 593 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 594 store double %val, double addrspace(3)* %arrayidx0, align 8 595 %add.x = add nsw i32 %x.i, 8 596 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 597 store double %val, double addrspace(3)* 
%arrayidx1, align 8 598 ret void 599} 600 601define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { 602; CI-LABEL: misaligned_simple_write2_one_val_f64: 603; CI: ; %bb.0: 604; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 605; CI-NEXT: s_load_dword s0, s[0:1], 0xd 606; CI-NEXT: s_mov_b32 s7, 0xf000 607; CI-NEXT: s_mov_b32 s6, 0 608; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 609; CI-NEXT: v_mov_b32_e32 v1, 0 610; CI-NEXT: s_waitcnt lgkmcnt(0) 611; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 612; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 613; CI-NEXT: s_mov_b32 m0, -1 614; CI-NEXT: s_waitcnt vmcnt(0) 615; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 616; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15 617; CI-NEXT: s_endpgm 618; 619; GFX9-LABEL: misaligned_simple_write2_one_val_f64: 620; GFX9: ; %bb.0: 621; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 622; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 623; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 624; GFX9-NEXT: s_waitcnt lgkmcnt(0) 625; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 626; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 627; GFX9-NEXT: s_waitcnt vmcnt(0) 628; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 629; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15 630; GFX9-NEXT: s_endpgm 631 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 632 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 633 %val = load double, double addrspace(1)* %in.gep, align 8 634 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 635 store double %val, double addrspace(3)* %arrayidx0, align 4 636 %add.x = add nsw i32 %x.i, 7 637 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x 638 store double %val, double addrspace(3)* %arrayidx1, align 4 639 ret void 640} 641 642define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double 
addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { 643; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: 644; CI: ; %bb.0: 645; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 646; CI-NEXT: s_load_dword s0, s[0:1], 0xd 647; CI-NEXT: s_mov_b32 s7, 0xf000 648; CI-NEXT: s_mov_b32 s6, 0 649; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 650; CI-NEXT: v_mov_b32_e32 v1, 0 651; CI-NEXT: s_waitcnt lgkmcnt(0) 652; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 653; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 654; CI-NEXT: s_mov_b32 m0, -1 655; CI-NEXT: s_waitcnt vmcnt(0) 656; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 657; CI-NEXT: ds_write_b8 v0, v1 offset:5 658; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 659; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 660; CI-NEXT: ds_write_b8 v0, v2 offset:13 661; CI-NEXT: ds_write_b8 v0, v1 offset:9 662; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 663; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 664; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 665; CI-NEXT: ds_write_b8 v0, v3 offset:8 666; CI-NEXT: ds_write_b8 v0, v4 offset:7 667; CI-NEXT: ds_write_b8 v0, v5 offset:6 668; CI-NEXT: ds_write_b8 v0, v1 offset:16 669; CI-NEXT: ds_write_b8 v0, v6 offset:15 670; CI-NEXT: ds_write_b8 v0, v2 offset:14 671; CI-NEXT: ds_write_b8 v0, v3 offset:12 672; CI-NEXT: ds_write_b8 v0, v4 offset:11 673; CI-NEXT: ds_write_b8 v0, v5 offset:10 674; CI-NEXT: s_endpgm 675; 676; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 677; GFX9-ALIGNED: ; %bb.0: 678; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 679; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 680; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 681; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 682; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 683; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 684; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) 685; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 686; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 687; GFX9-ALIGNED-NEXT: 
ds_write_b8 v2, v0 offset:5 688; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 689; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 690; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 691; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 692; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 693; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 694; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 695; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8 696; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 697; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 698; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 699; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 700; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 701; GFX9-ALIGNED-NEXT: s_endpgm 702; 703; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 704; GFX9-UNALIGNED: ; %bb.0: 705; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 706; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 707; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 708; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 709; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 710; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 711; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2 712; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2 713; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) 714; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 715; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 716; GFX9-UNALIGNED-NEXT: s_endpgm 717 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 718 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 719 %val = load double, double addrspace(1)* %in.gep, align 8 720 %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 721 %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)* 722 %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5 723 %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)* 724 store 
double %val, double addrspace(3)* %addr0, align 1
  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
  store double %val, double addrspace(3)* %addr1, align 1
  ret void
}

; Two different f64 values, loaded with volatile loads from %in[x] and
; %in[x + 1], are stored with align 8 to @lds.f64[x] and @lds.f64[x + 8].
; Both the CI and GFX9 check blocks expect the two stores to be emitted as a
; single ds_write2_b64 with offset1 = 8.
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}

; Four-element i32 array in addrspace(3); target of the two constant-index
; store tests that follow.
@foo = addrspace(3) global [4 x i32] undef, align 4

; Stores the constant 123 to @foo[0] and @foo[1] (neighbouring elements, both
; at constant addresses). Both check blocks expect one ds_write2_b32 with
; offset1 = 1.
define amdgpu_kernel void @store_constant_adjacent_offsets() {
; CI-LABEL: store_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
  ret void
}

; Same as the adjacent-offset test above, but the second store goes to
; @foo[2], leaving a one-element gap. Both check blocks expect one
; ds_write2_b32 with offset1 = 2.
define amdgpu_kernel void @store_constant_disjoint_offsets() {
; CI-LABEL: store_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; GFX9-NEXT: s_endpgm
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
  ret void
}

; Four-element i64 array in addrspace(3) that is declared with only 4-byte
; alignment.
@bar = addrspace(3) global [4 x i64] undef, align 4

; Stores i64 123 to @bar[0] and @bar[1], each store marked align 4.
; Expected output differs by configuration:
;   - CI, and GFX9 with -unaligned-access-mode, expect each i64 to be written
;     as a ds_write2_b32 pair (two ds_write2_b32 instructions in total).
;   - GFX9 with +unaligned-access-mode expects one ds_write2_b64 with
;     offset1 = 1.
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v0 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_movk_i32 s0, 0x7b
; GFX9-UNALIGNED-NEXT: s_mov_b32 s1, 0
; GFX9-UNALIGNED-NEXT: s_mov_b32 s2, s0
; GFX9-UNALIGNED-NEXT: s_mov_b32 s3, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
  ret void
}

; 4096-element i64 array in addrspace(3) with 4-byte alignment, large enough
; to produce the big constant byte offsets used by the next test.
@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; Stores i64 123 to @bar.large[2048] and @bar.large[4095] (byte offsets
; 0x4000 and 0x7ff8), each store marked align 4. Both check blocks expect the
; two byte offsets to be materialized as separate base registers, with each
; i64 written by its own ds_write2_b32 using offset1 = 1.
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x4000
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, 0x7ff8
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
  ret void
}

; Two float arrays in addrspace(3) written by @write2_sgemm_sequence.
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4

; Stores one float (loaded once from %in) to ten array slots:
;   @sgemm.lA at indices x, x+1, x+16, x+17
;   @sgemm.lB at indices y, y+1, y+32, y+33, y+64, y+65
; Here x is the workgroup id in X (note that %x.i holds a workgroup id, not a
; workitem id, despite its name) and y is the workitem id in Y. The %C, %lda
; and %ldb arguments are not used by the body.
; Both check blocks expect the ten stores to be emitted as five ds_write2_b32
; instructions, each covering one pair of neighbouring elements.
define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
; CI-LABEL: write2_sgemm_sequence:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_lshl_b32 s2, s2, 2
; CI-NEXT: s_add_i32 s3, s2, 0xc20
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: s_addk_i32 s2, 0xc60
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
  %val = load float, float addrspace(1)* %in
  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx44, align 4
  %add47 = add nsw i32 %x.i, 1
  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
  store float %val, float addrspace(3)* %arrayidx48, align 4
  %add51 = add nsw i32 %x.i, 16
  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
  store float %val, float addrspace(3)* %arrayidx52, align 4
  %add55 = add nsw i32 %x.i, 17
  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
  store float %val, float addrspace(3)* %arrayidx56, align 4
  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
  store float %val, float addrspace(3)* %arrayidx60, align 4
  %add63 = add nsw i32 %y.i, 1
  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
  store float %val, float addrspace(3)* %arrayidx64, align 4
  %add67 = add nsw i32 %y.i, 32
  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
  store float %val, float addrspace(3)* %arrayidx68, align 4
  %add71 = add nsw i32 %y.i, 33
  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
  store float %val, float addrspace(3)* %arrayidx72, align 4
  %add75 = add nsw i32 %y.i, 64
  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
  store float %val, float addrspace(3)* %arrayidx76, align 4
  %add79 = add nsw i32 %y.i, 65
  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
  store float %val, float addrspace(3)* %arrayidx80, align 4
  ret void
}

; Loads one <4 x float> from %in (the getelementptr has no indices, so the
; address is %in itself) and stores it to %out[x] with align 4, where x is
; the workitem id in X.
; Expected output differs by configuration:
;   - CI, and GFX9 with -unaligned-access-mode, expect the 16-byte store to be
;     split into two ds_write2_b32 instructions (dwords 0/1 and dwords 2/3).
;   - GFX9 with +unaligned-access-mode expects one ds_write2_b64 with
;     offset1 = 1.
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_v4f32_superreg_align4:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s4, s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
  ret void
}

; Array of <2 x i32> in addrspace(3) declared with 1-byte alignment.
@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1

; Stores the constant vector <123, 456> with align 1 at byte offset 65 (an odd
; offset) from the start of @v2i32_align1.
; Expected output differs by configuration:
;   - CI, and GFX9 with -unaligned-access-mode, expect the store to be broken
;     into eight single-byte ds_write_b8 instructions (offsets 65 through 72).
;   - GFX9 with +unaligned-access-mode expects one ds_write2_b32 with
;     offset1 = 1 on base address 0x41 (= 65).
define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-LABEL: write2_v2i32_align1_odd_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
  ret void
}

; Intrinsic declarations and attribute groups for this file.
; NOTE(review): @llvm.amdgcn.workgroup.id.y and attribute group #2 are not
; referenced in the part of the file reviewed here -- confirm whether they are
; used elsewhere before considering them dead.
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { convergent nounwind }