; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s

; Each kernel below stores to addrspace(3) memory; the expected output records
; which pairs of stores are emitted as a single ds_write2_* instruction and
; which remain separate ds_write_* instructions. All three llc invocations
; enable +load-store-opt; the two gfx900 invocations differ only in
; unaligned-access-mode.

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; One loaded float is stored to @lds at element indices x and x+8; the expected
; output is a single ds_write2_b32 using the same data register twice.
define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_one_val_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %val = load float, float addrspace(1)* %in.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Two volatile loads of adjacent floats feed non-volatile stores to @lds at
; x and x+8; the expected output is a single ds_write2_b32 with two different
; data registers.
define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Same store pattern as above, but the first store to @lds is volatile; the
; expected output keeps two separate ds_write_b32 instructions.
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_0:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v2
; CI-NEXT:    ds_write_b32 v0, v1 offset:32
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write_b32 v0, v1
; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Mirror of the previous kernel with the second store to @lds volatile instead
; of the first; the expected output again keeps two ds_write_b32 instructions.
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_1:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v2
; CI-NEXT:    ds_write_b32 v0, v1 offset:32
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write_b32 v0, v1
; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; 2 data subregisters from different super registers.
; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
; This should be an s_mov_b32. The v_mov_b32 gets introduced by an
; early legalization of the constant bus constraint on the v_lshl_add_u32,
; and then SIFoldOperands folds in an unlucky order.
; Review note - the v_mov_b32 described in the TODO above does not appear in the
; GFX9 expected output of this kernel, so the TODO may be stale; confirm before
; relying on it.
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v5, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1] offset:8 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  %val0.0 = extractelement <2 x float> %val0, i32 0
  %val1.1 = extractelement <2 x float> %val1, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Both elements of a single <2 x float> load are stored to @lds at x and x+8;
; the expected ds_write2_b32 takes its two data operands from the one loaded
; register pair.
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
  %val0 = extractelement <2 x float> %val, i32 0
  %val1 = extractelement <2 x float> %val, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Elements 0 and 3 of a single <4 x float> load are stored to @lds at x and
; x+8; the expected ds_write2_b32 uses the first and last register of the
; loaded four-register tuple.
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg4_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[1:4], v1, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
  %val0 = extractelement <4 x float> %val, i32 0
  %val1 = extractelement <4 x float> %val, i32 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second store is 255 elements past the first; the expected output is
; still a single ds_write2_b32, with offset1 equal to 255.
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_max_offset_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second store is 257 elements (1028 bytes) past the first; the expected
; output keeps two separate ds_write_b32 instructions.
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_too_far_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    ds_write_b32 v0, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write_b32 v0, v1 offset:1028
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    ds_write_b32 v0, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write_b32 v0, v2 offset:1028
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Four stores to @lds at element offsets 0, 8, 11 and 27 from the work-item id;
; the expected output pairs them into two ds_write2_b32 instructions.
define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT:    s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; Same as the previous kernel except that the first store is at element offset
; 3 instead of 0, so both offsets of the first ds_write2_b32 are nonzero.
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT:    s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; The two store addresses come from the two lanes of a vector-of-pointers
; kernel argument, so they use different base registers; the expected output
; keeps two separate ds_write_b32 instructions.
; Review note - %index.1 inserts into lane 0 a second time, so lane 0 is the
; constant 8 and lane 1 of the index vector stays undef, and %add.x is never
; used. Both are left untouched because the expected output is generated from
; exactly this IR; confirm whether lane 1 was intended.
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    v_mov_b32_e32 v1, s8
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    v_mov_b32_e32 v3, s9
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    ds_write_b32 v1, v2 offset:32
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write_b32 v3, v0 offset:32
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write_b32 v3, v2 offset:32
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
  store float %val0, float addrspace(3)* %gep.0, align 4

  %add.x = add nsw i32 %x.i, 8
  store float %val1, float addrspace(3)* %gep.1.offset, align 4
  ret void
}

; One loaded double is stored to @lds.f64 at element indices x and x+8 with
; align 8; the expected output is a single ds_write2_b64.
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_one_val_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 8
  ret void
}

; The two double stores (elements x and x+7 of the %lds argument) carry only
; align 4; the expected output writes each double as one ds_write2_b32 of its
; two 32-bit halves rather than using a 64-bit DS write.
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
; CI-LABEL: misaligned_simple_write2_one_val_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    v_add_u32_e32 v2, s4, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
; GFX9-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 7
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 4
  ret void
}

define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
641; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: 642; CI: ; %bb.0: 643; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 644; CI-NEXT: s_load_dword s0, s[0:1], 0xd 645; CI-NEXT: s_mov_b32 s7, 0xf000 646; CI-NEXT: s_mov_b32 s6, 0 647; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 648; CI-NEXT: v_mov_b32_e32 v1, 0 649; CI-NEXT: s_waitcnt lgkmcnt(0) 650; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 651; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 652; CI-NEXT: s_mov_b32 m0, -1 653; CI-NEXT: s_waitcnt vmcnt(0) 654; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 655; CI-NEXT: ds_write_b8 v0, v1 offset:5 656; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 657; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 658; CI-NEXT: ds_write_b8 v0, v2 offset:13 659; CI-NEXT: ds_write_b8 v0, v1 offset:9 660; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 661; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 662; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 663; CI-NEXT: ds_write_b8 v0, v3 offset:8 664; CI-NEXT: ds_write_b8 v0, v4 offset:7 665; CI-NEXT: ds_write_b8 v0, v5 offset:6 666; CI-NEXT: ds_write_b8 v0, v1 offset:16 667; CI-NEXT: ds_write_b8 v0, v6 offset:15 668; CI-NEXT: ds_write_b8 v0, v2 offset:14 669; CI-NEXT: ds_write_b8 v0, v3 offset:12 670; CI-NEXT: ds_write_b8 v0, v4 offset:11 671; CI-NEXT: ds_write_b8 v0, v5 offset:10 672; CI-NEXT: s_endpgm 673; 674; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 675; GFX9-ALIGNED: ; %bb.0: 676; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 677; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 678; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 679; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 680; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 681; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 682; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) 683; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 684; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 685; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 686; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 687; 
GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 688; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 689; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 690; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 691; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 692; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 693; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8 694; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 695; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 696; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 697; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 698; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 699; GFX9-ALIGNED-NEXT: s_endpgm 700; 701; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 702; GFX9-UNALIGNED: ; %bb.0: 703; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 704; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 705; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 706; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 707; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 708; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 709; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2 710; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2 711; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) 712; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 713; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 714; GFX9-UNALIGNED-NEXT: s_endpgm 715 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 716 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 717 %val = load double, double addrspace(1)* %in.gep, align 8 718 %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 719 %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)* 720 %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5 721 %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)* 722 store double %val, double addrspace(3)* %addr0, align 1 723 %addr1.i8 = getelementptr 
inbounds i8, i8 addrspace(3)* %base.i8, i32 9 724 %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)* 725 store double %val, double addrspace(3)* %addr1, align 1 726 ret void 727} 728 729define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { 730; CI-LABEL: simple_write2_two_val_f64: 731; CI: ; %bb.0: 732; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 733; CI-NEXT: s_mov_b32 s3, 0xf000 734; CI-NEXT: s_mov_b32 s2, 0 735; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 736; CI-NEXT: v_mov_b32_e32 v1, 0 737; CI-NEXT: s_waitcnt lgkmcnt(0) 738; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc 739; CI-NEXT: s_waitcnt vmcnt(0) 740; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc 741; CI-NEXT: s_waitcnt vmcnt(0) 742; CI-NEXT: s_mov_b32 m0, -1 743; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8 744; CI-NEXT: s_endpgm 745; 746; GFX9-LABEL: simple_write2_two_val_f64: 747; GFX9: ; %bb.0: 748; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 749; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 750; GFX9-NEXT: s_waitcnt lgkmcnt(0) 751; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc 752; GFX9-NEXT: s_waitcnt vmcnt(0) 753; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc 754; GFX9-NEXT: s_waitcnt vmcnt(0) 755; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8 756; GFX9-NEXT: s_endpgm 757 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 758 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i 759 %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 760 %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8 761 %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8 762 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 763 store double %val0, double addrspace(3)* %arrayidx0, align 8 764 %add.x = add nsw i32 %x.i, 8 765 %arrayidx1 = 
getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 766 store double %val1, double addrspace(3)* %arrayidx1, align 8 767 ret void 768} 769 770@foo = addrspace(3) global [4 x i32] undef, align 4 771 772define amdgpu_kernel void @store_constant_adjacent_offsets() { 773; CI-LABEL: store_constant_adjacent_offsets: 774; CI: ; %bb.0: 775; CI-NEXT: s_movk_i32 s0, 0x7b 776; CI-NEXT: v_mov_b32_e32 v0, 0 777; CI-NEXT: v_mov_b32_e32 v1, s0 778; CI-NEXT: v_mov_b32_e32 v2, s0 779; CI-NEXT: s_mov_b32 m0, -1 780; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 781; CI-NEXT: s_endpgm 782; 783; GFX9-LABEL: store_constant_adjacent_offsets: 784; GFX9: ; %bb.0: 785; GFX9-NEXT: s_movk_i32 s0, 0x7b 786; GFX9-NEXT: v_mov_b32_e32 v0, 0 787; GFX9-NEXT: v_mov_b32_e32 v1, s0 788; GFX9-NEXT: v_mov_b32_e32 v2, s0 789; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 790; GFX9-NEXT: s_endpgm 791 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 792 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 793 ret void 794} 795 796define amdgpu_kernel void @store_constant_disjoint_offsets() { 797; CI-LABEL: store_constant_disjoint_offsets: 798; CI: ; %bb.0: 799; CI-NEXT: v_mov_b32_e32 v0, 0x7b 800; CI-NEXT: v_mov_b32_e32 v1, 0 801; CI-NEXT: s_mov_b32 m0, -1 802; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2 803; CI-NEXT: s_endpgm 804; 805; GFX9-LABEL: store_constant_disjoint_offsets: 806; GFX9: ; %bb.0: 807; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b 808; GFX9-NEXT: v_mov_b32_e32 v1, 0 809; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2 810; GFX9-NEXT: s_endpgm 811 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 812 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 813 ret void 814} 815 816@bar = 
addrspace(3) global [4 x i64] undef, align 4 817 818define amdgpu_kernel void @store_misaligned64_constant_offsets() { 819; CI-LABEL: store_misaligned64_constant_offsets: 820; CI: ; %bb.0: 821; CI-NEXT: v_mov_b32_e32 v0, 0 822; CI-NEXT: v_mov_b32_e32 v1, 0x7b 823; CI-NEXT: s_mov_b32 m0, -1 824; CI-NEXT: ds_write2_b32 v0, v1, v0 offset1:1 825; CI-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3 826; CI-NEXT: s_endpgm 827; 828; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets: 829; GFX9-ALIGNED: ; %bb.0: 830; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0 831; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b 832; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset1:1 833; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3 834; GFX9-ALIGNED-NEXT: s_endpgm 835; 836; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets: 837; GFX9-UNALIGNED: ; %bb.0: 838; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b 839; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0 840; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0 841; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1 842; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3] 843; GFX9-UNALIGNED-NEXT: s_endpgm 844 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 845 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 846 ret void 847} 848 849@bar.large = addrspace(3) global [4096 x i64] undef, align 4 850 851define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { 852; CI-LABEL: store_misaligned64_constant_large_offsets: 853; CI: ; %bb.0: 854; CI-NEXT: v_mov_b32_e32 v0, 0x4000 855; CI-NEXT: v_mov_b32_e32 v1, 0x7b 856; CI-NEXT: v_mov_b32_e32 v2, 0 857; CI-NEXT: s_mov_b32 m0, -1 858; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 859; CI-NEXT: v_mov_b32_e32 v0, 0x7ff8 860; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 861; CI-NEXT: s_endpgm 862; 863; GFX9-LABEL: 
store_misaligned64_constant_large_offsets: 864; GFX9: ; %bb.0: 865; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000 866; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b 867; GFX9-NEXT: v_mov_b32_e32 v2, 0 868; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 869; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff8 870; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 871; GFX9-NEXT: s_endpgm 872 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 873 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 874 ret void 875} 876 877@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 878@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 879 880define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { 881; CI-LABEL: write2_sgemm_sequence: 882; CI: ; %bb.0: 883; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 884; CI-NEXT: s_lshl_b32 s2, s2, 2 885; CI-NEXT: s_add_i32 s3, s2, 0xc20 886; CI-NEXT: v_mov_b32_e32 v0, s3 887; CI-NEXT: s_addk_i32 s2, 0xc60 888; CI-NEXT: s_waitcnt lgkmcnt(0) 889; CI-NEXT: s_load_dword s0, s[0:1], 0x0 890; CI-NEXT: s_mov_b32 m0, -1 891; CI-NEXT: s_waitcnt lgkmcnt(0) 892; CI-NEXT: v_mov_b32_e32 v2, s0 893; CI-NEXT: v_mov_b32_e32 v3, s0 894; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 895; CI-NEXT: v_mov_b32_e32 v0, s2 896; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 897; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1 898; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 899; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33 900; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65 901; CI-NEXT: s_endpgm 902; 903; GFX9-LABEL: write2_sgemm_sequence: 904; GFX9: ; %bb.0: 905; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 906; GFX9-NEXT: s_lshl_b32 s2, s2, 2 907; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 908; GFX9-NEXT: 
s_addk_i32 s2, 0xc60 909; GFX9-NEXT: v_mov_b32_e32 v0, s3 910; GFX9-NEXT: s_waitcnt lgkmcnt(0) 911; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 912; GFX9-NEXT: v_mov_b32_e32 v2, s2 913; GFX9-NEXT: s_waitcnt lgkmcnt(0) 914; GFX9-NEXT: v_mov_b32_e32 v3, s0 915; GFX9-NEXT: v_mov_b32_e32 v4, s0 916; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 917; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1 918; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1 919; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 920; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33 921; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65 922; GFX9-NEXT: s_endpgm 923 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 924 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 925 %val = load float, float addrspace(1)* %in 926 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i 927 store float %val, float addrspace(3)* %arrayidx44, align 4 928 %add47 = add nsw i32 %x.i, 1 929 %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 930 store float %val, float addrspace(3)* %arrayidx48, align 4 931 %add51 = add nsw i32 %x.i, 16 932 %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 933 store float %val, float addrspace(3)* %arrayidx52, align 4 934 %add55 = add nsw i32 %x.i, 17 935 %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 936 store float %val, float addrspace(3)* %arrayidx56, align 4 937 %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i 938 store float %val, float addrspace(3)* %arrayidx60, align 4 939 %add63 = add nsw i32 %y.i, 1 940 %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 941 store float %val, float addrspace(3)* %arrayidx64, align 4 942 %add67 = 
add nsw i32 %y.i, 32 943 %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 944 store float %val, float addrspace(3)* %arrayidx68, align 4 945 %add71 = add nsw i32 %y.i, 33 946 %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 947 store float %val, float addrspace(3)* %arrayidx72, align 4 948 %add75 = add nsw i32 %y.i, 64 949 %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 950 store float %val, float addrspace(3)* %arrayidx76, align 4 951 %add79 = add nsw i32 %y.i, 65 952 %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 953 store float %val, float addrspace(3)* %arrayidx80, align 4 954 ret void 955} 956 957define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { 958; CI-LABEL: simple_write2_v4f32_superreg_align4: 959; CI: ; %bb.0: 960; CI-NEXT: s_load_dword s4, s[0:1], 0x9 961; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 962; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 963; CI-NEXT: s_mov_b32 m0, -1 964; CI-NEXT: s_waitcnt lgkmcnt(0) 965; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 966; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 967; CI-NEXT: s_waitcnt lgkmcnt(0) 968; CI-NEXT: v_mov_b32_e32 v1, s0 969; CI-NEXT: v_mov_b32_e32 v2, s1 970; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 971; CI-NEXT: v_mov_b32_e32 v3, s2 972; CI-NEXT: v_mov_b32_e32 v1, s3 973; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 974; CI-NEXT: s_endpgm 975; 976; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: 977; GFX9-ALIGNED: ; %bb.0: 978; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 979; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 980; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 981; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 982; GFX9-ALIGNED-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x0 983; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 984; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 985; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 986; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2 987; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3 988; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 989; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3 990; GFX9-ALIGNED-NEXT: s_endpgm 991; 992; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: 993; GFX9-UNALIGNED: ; %bb.0: 994; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 995; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 996; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 997; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4 998; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 999; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1000; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 1001; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 1002; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2 1003; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3 1004; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] 1005; GFX9-UNALIGNED-NEXT: s_endpgm 1006 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 1007 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in 1008 %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4 1009 %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i 1010 store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4 1011 ret void 1012} 1013 1014@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 1015 1016define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { 1017; CI-LABEL: write2_v2i32_align1_odd_offset: 1018; CI: ; %bb.0: ; %entry 1019; CI-NEXT: v_mov_b32_e32 v0, 0x7b 1020; CI-NEXT: v_mov_b32_e32 v1, 0 1021; CI-NEXT: s_mov_b32 m0, -1 1022; CI-NEXT: ds_write_b8 v1, v0 offset:65 1023; CI-NEXT: v_mov_b32_e32 v0, 1 1024; CI-NEXT: ds_write_b8 v1, v0 offset:70 1025; CI-NEXT: 
v_mov_b32_e32 v0, 0xc8 1026; CI-NEXT: ds_write_b8 v1, v0 offset:69 1027; CI-NEXT: ds_write_b8 v1, v1 offset:68 1028; CI-NEXT: ds_write_b8 v1, v1 offset:67 1029; CI-NEXT: ds_write_b8 v1, v1 offset:66 1030; CI-NEXT: ds_write_b8 v1, v1 offset:72 1031; CI-NEXT: ds_write_b8 v1, v1 offset:71 1032; CI-NEXT: s_endpgm 1033; 1034; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset: 1035; GFX9-ALIGNED: ; %bb.0: ; %entry 1036; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b 1037; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0 1038; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65 1039; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1 1040; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70 1041; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8 1042; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69 1043; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68 1044; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67 1045; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66 1046; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72 1047; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71 1048; GFX9-ALIGNED-NEXT: s_endpgm 1049; 1050; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset: 1051; GFX9-UNALIGNED: ; %bb.0: ; %entry 1052; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 1053; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b 1054; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8 1055; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 1056; GFX9-UNALIGNED-NEXT: s_endpgm 1057entry: 1058 store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 1059 ret void 1060} 1061 1062declare i32 @llvm.amdgcn.workgroup.id.x() #1 1063declare i32 @llvm.amdgcn.workgroup.id.y() #1 1064declare i32 @llvm.amdgcn.workitem.id.x() #1 1065declare i32 @llvm.amdgcn.workitem.id.y() #1 1066 1067attributes #0 = { nounwind } 1068attributes #1 = { nounwind readnone speculatable } 
; NOTE(review): no function, declaration or call site in the visible part of
; this file references attribute group #2; confirm whether it is still needed.
attributes #2 = { convergent nounwind }