; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s

; Each kernel below stores to addrspace(3) memory, and the recorded checks show
; whether llc emits the stores as a single ds_write2_* instruction or as
; separate ds_write_* instructions. The three llc invocations above cover
; bonaire, gfx900 without unaligned-access-mode, and gfx900 with it.

; Backing arrays in addrspace(3) used by the f32 and f64 tests respectively.
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; One loaded float is stored to @lds[x] and @lds[x + 8]; the checks record a
; single ds_write2_b32 with offset1:8 that uses the same data register twice.
define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_one_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %val = load float, float addrspace(1)* %in.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Two adjacent floats are read with volatile loads and stored to @lds[x] and
; @lds[x + 8]; the checks record one ds_write2_b32 with offset1:8 carrying two
; different data registers.
define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The first of the two addrspace(3) stores is volatile; the checks record two
; separate ds_write_b32 instructions rather than a ds_write2_b32.
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Same as the previous kernel, but here the second addrspace(3) store is the
; volatile one; the checks again record two separate ds_write_b32 instructions.
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; 2 data subregisters from different super registers.
; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
; This should be an s_mov_b32. The v_mov_b32 gets introduced by an
; early legalization of the constant bus constraint on the v_lshl_add_u32,
; and then SIFoldOperands folds in an unlucky order.
; NOTE(review): the GFX9 checks recorded below contain no v_mov_b32 of
; lds@abs32@lo; confirm whether the TODO above still applies to this
; configuration.
; The stored values are element 0 of the first <2 x float> load and element 1
; of the second; the checks record one ds_write2_b32 with offset1:8.
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  %val0.0 = extractelement <2 x float> %val0, i32 0
  %val1.1 = extractelement <2 x float> %val1, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Both stored values are elements (0 and 1) of a single <2 x float> load; the
; checks record one ds_write2_b32 with offset1:8 fed by the two halves of the
; loaded register pair.
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
  %val0 = extractelement <2 x float> %val, i32 0
  %val1 = extractelement <2 x float> %val, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The stored values are elements 0 and 3 of a single <4 x float> load; the
; checks record one ds_write2_b32 with offset1:8 using the first and last
; registers of the loaded quad.
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg4_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
  %val0 = extractelement <4 x float> %val, i32 0
  %val1 = extractelement <4 x float> %val, i32 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second store is 255 elements past the first; the checks record one
; ds_write2_b32 with offset1:255.
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_max_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second store is 257 elements past the first; the checks record two
; separate ds_write_b32 instructions, the second with offset:1028.
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_too_far_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:1028
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Four stores at element offsets 0, 8, 11 and 27 from the work-item index; the
; checks record two ds_write2_b32 instructions (offset1:8, then
; offset0:11 offset1:27).
define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; Same shape as the previous kernel, but the first store is at element offset 3
; instead of 0; the checks record offset0:3 offset1:8 on the first
; ds_write2_b32 and offset0:11 offset1:27 on the second.
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; The two store addresses come from the two lanes of a <2 x float addrspace(3)*>
; kernel argument, so they use different base registers; the checks record two
; separate ds_write_b32 instructions, each with offset:32.
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_mov_b32_e32 v1, s8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: v_mov_b32_e32 v3, s9
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v1, v2 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v3, v0 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v2, v1 offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v3, v0 offset:32
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  ; NOTE(review): this inserts into lane 0 again, replacing %x.i with 8 and
  ; leaving lane 1 undef. The recorded checks depend on this, so confirm
  ; whether lane 1 was intended before changing it.
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
  store float %val0, float addrspace(3)* %gep.0, align 4

  ; NOTE(review): %add.x has no users in this function.
  %add.x = add nsw i32 %x.i, 8
  store float %val1, float addrspace(3)* %gep.1.offset, align 4
  ret void
}

; One loaded double is stored to @lds.f64[x] and @lds.f64[x + 8] with align 8;
; the checks record a single ds_write2_b64 with offset1:8.
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 8
  ret void
}

; The two double stores (elements x and x + 7 of the %lds argument) are only
; align 4; the checks record each double written as a ds_write2_b32 pair
; (offset1:1, then offset0:14 offset1:15) instead of a ds_write2_b64.
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
; CI-LABEL: misaligned_simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v2, s0, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 7
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 4
  ret void
}

; The two double stores are align 1 and sit at byte offsets 5 and 9 from the
; element address. The bonaire and GFX9-ALIGNED checks record byte-sized
; ds_write_b8 stores; the GFX9-UNALIGNED checks record two ds_write2_b32
; instructions on separately computed addresses.
define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:5
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: ds_write_b8 v0, v1 offset:9
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: ds_write_b8 v0, v3 offset:8
; CI-NEXT: ds_write_b8 v0, v4 offset:7
; CI-NEXT: ds_write_b8 v0, v5 offset:6
; CI-NEXT: ds_write_b8 v0, v1 offset:16
; CI-NEXT: ds_write_b8 v0, v6 offset:15
; CI-NEXT: ds_write_b8 v0, v2 offset:14
; CI-NEXT: ds_write_b8 v0, v3 offset:12
; CI-NEXT: ds_write_b8 v0, v4 offset:11
; CI-NEXT: ds_write_b8 v0, v5 offset:10
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s0, v2
; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s0, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
  store double %val, double addrspace(3)* %addr0, align 1
  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
  store double %val, double addrspace(3)* %addr1, align 1
  ret void
}

; Two adjacent doubles are read with volatile loads and stored to
; @lds.f64[x] and @lds.f64[x + 8]; the checks record one ds_write2_b64 with
; offset1:8 carrying two different register pairs.
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}

@foo = addrspace(3) global
[4 x i32] undef, align 4 764 765define amdgpu_kernel void @store_constant_adjacent_offsets() { 766; CI-LABEL: store_constant_adjacent_offsets: 767; CI: ; %bb.0: 768; CI-NEXT: s_movk_i32 s0, 0x7b 769; CI-NEXT: v_mov_b32_e32 v0, 0 770; CI-NEXT: v_mov_b32_e32 v1, s0 771; CI-NEXT: v_mov_b32_e32 v2, s0 772; CI-NEXT: s_mov_b32 m0, -1 773; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 774; CI-NEXT: s_endpgm 775; 776; GFX9-LABEL: store_constant_adjacent_offsets: 777; GFX9: ; %bb.0: 778; GFX9-NEXT: s_movk_i32 s0, 0x7b 779; GFX9-NEXT: v_mov_b32_e32 v0, 0 780; GFX9-NEXT: v_mov_b32_e32 v1, s0 781; GFX9-NEXT: v_mov_b32_e32 v2, s0 782; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 783; GFX9-NEXT: s_endpgm 784 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 785 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 786 ret void 787} 788 789define amdgpu_kernel void @store_constant_disjoint_offsets() { 790; CI-LABEL: store_constant_disjoint_offsets: 791; CI: ; %bb.0: 792; CI-NEXT: v_mov_b32_e32 v0, 0x7b 793; CI-NEXT: v_mov_b32_e32 v1, 0 794; CI-NEXT: s_mov_b32 m0, -1 795; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2 796; CI-NEXT: s_endpgm 797; 798; GFX9-LABEL: store_constant_disjoint_offsets: 799; GFX9: ; %bb.0: 800; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b 801; GFX9-NEXT: v_mov_b32_e32 v1, 0 802; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2 803; GFX9-NEXT: s_endpgm 804 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 805 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 806 ret void 807} 808 809@bar = addrspace(3) global [4 x i64] undef, align 4 810 811define amdgpu_kernel void @store_misaligned64_constant_offsets() { 812; CI-LABEL: store_misaligned64_constant_offsets: 813; CI: ; %bb.0: 814; CI-NEXT: 
v_mov_b32_e32 v0, 0 815; CI-NEXT: v_mov_b32_e32 v1, 0x7b 816; CI-NEXT: s_mov_b32 m0, -1 817; CI-NEXT: ds_write2_b32 v0, v1, v0 offset1:1 818; CI-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3 819; CI-NEXT: s_endpgm 820; 821; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets: 822; GFX9-ALIGNED: ; %bb.0: 823; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0 824; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b 825; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset1:1 826; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3 827; GFX9-ALIGNED-NEXT: s_endpgm 828; 829; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets: 830; GFX9-UNALIGNED: ; %bb.0: 831; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b 832; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0 833; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0 834; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1 835; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3] 836; GFX9-UNALIGNED-NEXT: s_endpgm 837 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 838 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 839 ret void 840} 841 842@bar.large = addrspace(3) global [4096 x i64] undef, align 4 843 844define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { 845; CI-LABEL: store_misaligned64_constant_large_offsets: 846; CI: ; %bb.0: 847; CI-NEXT: v_mov_b32_e32 v0, 0x4000 848; CI-NEXT: v_mov_b32_e32 v1, 0x7b 849; CI-NEXT: v_mov_b32_e32 v2, 0 850; CI-NEXT: s_mov_b32 m0, -1 851; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 852; CI-NEXT: v_mov_b32_e32 v0, 0x7ff8 853; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 854; CI-NEXT: s_endpgm 855; 856; GFX9-LABEL: store_misaligned64_constant_large_offsets: 857; GFX9: ; %bb.0: 858; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000 859; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b 860; GFX9-NEXT: v_mov_b32_e32 v2, 0 861; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 862; 
GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff8 863; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 864; GFX9-NEXT: s_endpgm 865 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 866 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 867 ret void 868} 869 870@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 871@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 872 873define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { 874; CI-LABEL: write2_sgemm_sequence: 875; CI: ; %bb.0: 876; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 877; CI-NEXT: s_lshl_b32 s2, s2, 2 878; CI-NEXT: s_add_i32 s3, s2, 0xc20 879; CI-NEXT: v_mov_b32_e32 v0, s3 880; CI-NEXT: s_addk_i32 s2, 0xc60 881; CI-NEXT: s_waitcnt lgkmcnt(0) 882; CI-NEXT: s_load_dword s0, s[0:1], 0x0 883; CI-NEXT: s_mov_b32 m0, -1 884; CI-NEXT: s_waitcnt lgkmcnt(0) 885; CI-NEXT: v_mov_b32_e32 v2, s0 886; CI-NEXT: v_mov_b32_e32 v3, s0 887; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 888; CI-NEXT: v_mov_b32_e32 v0, s2 889; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 890; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1 891; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 892; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33 893; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65 894; CI-NEXT: s_endpgm 895; 896; GFX9-LABEL: write2_sgemm_sequence: 897; GFX9: ; %bb.0: 898; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 899; GFX9-NEXT: s_lshl_b32 s2, s2, 2 900; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 901; GFX9-NEXT: s_addk_i32 s2, 0xc60 902; GFX9-NEXT: v_mov_b32_e32 v0, s3 903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 904; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 905; GFX9-NEXT: v_mov_b32_e32 v2, s2 906; GFX9-NEXT: s_waitcnt lgkmcnt(0) 907; GFX9-NEXT: 
v_mov_b32_e32 v3, s0 908; GFX9-NEXT: v_mov_b32_e32 v4, s0 909; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 910; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1 911; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1 912; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 913; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33 914; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65 915; GFX9-NEXT: s_endpgm 916 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 917 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 918 %val = load float, float addrspace(1)* %in 919 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i 920 store float %val, float addrspace(3)* %arrayidx44, align 4 921 %add47 = add nsw i32 %x.i, 1 922 %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 923 store float %val, float addrspace(3)* %arrayidx48, align 4 924 %add51 = add nsw i32 %x.i, 16 925 %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 926 store float %val, float addrspace(3)* %arrayidx52, align 4 927 %add55 = add nsw i32 %x.i, 17 928 %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 929 store float %val, float addrspace(3)* %arrayidx56, align 4 930 %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i 931 store float %val, float addrspace(3)* %arrayidx60, align 4 932 %add63 = add nsw i32 %y.i, 1 933 %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 934 store float %val, float addrspace(3)* %arrayidx64, align 4 935 %add67 = add nsw i32 %y.i, 32 936 %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 937 store float %val, float addrspace(3)* %arrayidx68, align 4 938 %add71 = add nsw i32 %y.i, 33 
939 %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 940 store float %val, float addrspace(3)* %arrayidx72, align 4 941 %add75 = add nsw i32 %y.i, 64 942 %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 943 store float %val, float addrspace(3)* %arrayidx76, align 4 944 %add79 = add nsw i32 %y.i, 65 945 %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 946 store float %val, float addrspace(3)* %arrayidx80, align 4 947 ret void 948} 949 950define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { 951; CI-LABEL: simple_write2_v4f32_superreg_align4: 952; CI: ; %bb.0: 953; CI-NEXT: s_load_dword s4, s[0:1], 0x9 954; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 955; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 956; CI-NEXT: s_mov_b32 m0, -1 957; CI-NEXT: s_waitcnt lgkmcnt(0) 958; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 959; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 960; CI-NEXT: s_waitcnt lgkmcnt(0) 961; CI-NEXT: v_mov_b32_e32 v1, s0 962; CI-NEXT: v_mov_b32_e32 v2, s1 963; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 964; CI-NEXT: v_mov_b32_e32 v3, s2 965; CI-NEXT: v_mov_b32_e32 v1, s3 966; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 967; CI-NEXT: s_endpgm 968; 969; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: 970; GFX9-ALIGNED: ; %bb.0: 971; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 972; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 973; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 974; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 975; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 976; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 977; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 978; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 979; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2 980; GFX9-ALIGNED-NEXT: 
v_mov_b32_e32 v4, s3 981; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 982; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3 983; GFX9-ALIGNED-NEXT: s_endpgm 984; 985; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: 986; GFX9-UNALIGNED: ; %bb.0: 987; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 988; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 989; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 990; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4 991; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 992; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 993; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 994; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 995; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2 996; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3 997; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] 998; GFX9-UNALIGNED-NEXT: s_endpgm 999 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 1000 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in 1001 %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4 1002 %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i 1003 store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4 1004 ret void 1005} 1006 1007@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 1008 1009define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { 1010; CI-LABEL: write2_v2i32_align1_odd_offset: 1011; CI: ; %bb.0: ; %entry 1012; CI-NEXT: v_mov_b32_e32 v0, 0x7b 1013; CI-NEXT: v_mov_b32_e32 v1, 0 1014; CI-NEXT: s_mov_b32 m0, -1 1015; CI-NEXT: ds_write_b8 v1, v0 offset:65 1016; CI-NEXT: v_mov_b32_e32 v0, 1 1017; CI-NEXT: ds_write_b8 v1, v0 offset:70 1018; CI-NEXT: v_mov_b32_e32 v0, 0xc8 1019; CI-NEXT: ds_write_b8 v1, v0 offset:69 1020; CI-NEXT: ds_write_b8 v1, v1 offset:68 1021; CI-NEXT: ds_write_b8 v1, v1 offset:67 1022; CI-NEXT: ds_write_b8 v1, v1 offset:66 1023; CI-NEXT: ds_write_b8 v1, v1 
offset:72 1024; CI-NEXT: ds_write_b8 v1, v1 offset:71 1025; CI-NEXT: s_endpgm 1026; 1027; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset: 1028; GFX9-ALIGNED: ; %bb.0: ; %entry 1029; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b 1030; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0 1031; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65 1032; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1 1033; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70 1034; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8 1035; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69 1036; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68 1037; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67 1038; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66 1039; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72 1040; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71 1041; GFX9-ALIGNED-NEXT: s_endpgm 1042; 1043; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset: 1044; GFX9-UNALIGNED: ; %bb.0: ; %entry 1045; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 1046; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b 1047; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8 1048; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 1049; GFX9-UNALIGNED-NEXT: s_endpgm 1050entry: 1051 store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 1052 ret void 1053} 1054 1055declare i32 @llvm.amdgcn.workgroup.id.x() #1 1056declare i32 @llvm.amdgcn.workgroup.id.y() #1 1057declare i32 @llvm.amdgcn.workitem.id.x() #1 1058declare i32 @llvm.amdgcn.workitem.id.y() #1 1059 1060attributes #0 = { nounwind } 1061attributes #1 = { nounwind readnone speculatable } 1062attributes #2 = { convergent nounwind } 1063