1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s 3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s 4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s 5 6@lds = addrspace(3) global [512 x float] undef, align 4 7@lds.f64 = addrspace(3) global [512 x double] undef, align 8 8 9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 10; CI-LABEL: simple_write2_one_val_f32: 11; CI: ; %bb.0: 12; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 13; CI-NEXT: s_mov_b32 s3, 0xf000 14; CI-NEXT: s_mov_b32 s2, 0 15; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 16; CI-NEXT: v_mov_b32_e32 v1, 0 17; CI-NEXT: s_waitcnt lgkmcnt(0) 18; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 19; CI-NEXT: s_mov_b32 m0, -1 20; CI-NEXT: s_waitcnt vmcnt(0) 21; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 22; CI-NEXT: s_endpgm 23; 24; GFX9-LABEL: simple_write2_one_val_f32: 25; GFX9: ; %bb.0: 26; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 27; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 28; GFX9-NEXT: s_waitcnt lgkmcnt(0) 29; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 30; GFX9-NEXT: s_waitcnt vmcnt(0) 31; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 32; GFX9-NEXT: s_endpgm 33 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 34 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i 35 %val = load float, float addrspace(1)* %in.gep, align 4 36 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 37 store float %val, float 
addrspace(3)* %arrayidx0, align 4 38 %add.x = add nsw i32 %x.i, 8 39 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 40 store float %val, float addrspace(3)* %arrayidx1, align 4 41 ret void 42} 43 44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 45; CI-LABEL: simple_write2_two_val_f32: 46; CI: ; %bb.0: 47; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 48; CI-NEXT: s_mov_b32 s3, 0xf000 49; CI-NEXT: s_mov_b32 s2, 0 50; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 51; CI-NEXT: v_mov_b32_e32 v1, 0 52; CI-NEXT: s_waitcnt lgkmcnt(0) 53; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 54; CI-NEXT: s_waitcnt vmcnt(0) 55; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc 56; CI-NEXT: s_waitcnt vmcnt(0) 57; CI-NEXT: s_mov_b32 m0, -1 58; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 59; CI-NEXT: s_endpgm 60; 61; GFX9-LABEL: simple_write2_two_val_f32: 62; GFX9: ; %bb.0: 63; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 64; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 65; GFX9-NEXT: s_waitcnt lgkmcnt(0) 66; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 67; GFX9-NEXT: s_waitcnt vmcnt(0) 68; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc 69; GFX9-NEXT: s_waitcnt vmcnt(0) 70; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 71; GFX9-NEXT: s_endpgm 72 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 73 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i 74 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 75 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4 76 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4 77 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 78 store float %val0, float addrspace(3)* %arrayidx0, align 4 79 %add.x = add nsw i32 %x.i, 8 80 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] 
addrspace(3)* @lds, i32 0, i32 %add.x 81 store float %val1, float addrspace(3)* %arrayidx1, align 4 82 ret void 83} 84 85define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 86; CI-LABEL: simple_write2_two_val_f32_volatile_0: 87; CI: ; %bb.0: 88; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 89; CI-NEXT: s_mov_b32 s7, 0xf000 90; CI-NEXT: s_mov_b32 s6, 0 91; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 92; CI-NEXT: v_mov_b32_e32 v1, 0 93; CI-NEXT: s_waitcnt lgkmcnt(0) 94; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 95; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 96; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 97; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 98; CI-NEXT: s_waitcnt vmcnt(0) 99; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc 100; CI-NEXT: s_waitcnt vmcnt(0) 101; CI-NEXT: s_mov_b32 m0, -1 102; CI-NEXT: ds_write_b32 v0, v2 103; CI-NEXT: ds_write_b32 v0, v1 offset:32 104; CI-NEXT: s_endpgm 105; 106; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: 107; GFX9: ; %bb.0: 108; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 109; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 111; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 112; GFX9-NEXT: s_waitcnt vmcnt(0) 113; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 114; GFX9-NEXT: s_waitcnt vmcnt(0) 115; GFX9-NEXT: ds_write_b32 v0, v1 116; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 117; GFX9-NEXT: s_endpgm 118 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 119 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 120 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 121 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4 122 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4 123 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 124 store volatile float %val0, float addrspace(3)* %arrayidx0, 
align 4 125 %add.x = add nsw i32 %x.i, 8 126 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 127 store float %val1, float addrspace(3)* %arrayidx1, align 4 128 ret void 129} 130 131define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 132; CI-LABEL: simple_write2_two_val_f32_volatile_1: 133; CI: ; %bb.0: 134; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 135; CI-NEXT: s_mov_b32 s7, 0xf000 136; CI-NEXT: s_mov_b32 s6, 0 137; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 138; CI-NEXT: v_mov_b32_e32 v1, 0 139; CI-NEXT: s_waitcnt lgkmcnt(0) 140; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 141; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 142; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 143; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 144; CI-NEXT: s_waitcnt vmcnt(0) 145; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc 146; CI-NEXT: s_waitcnt vmcnt(0) 147; CI-NEXT: s_mov_b32 m0, -1 148; CI-NEXT: ds_write_b32 v0, v2 149; CI-NEXT: ds_write_b32 v0, v1 offset:32 150; CI-NEXT: s_endpgm 151; 152; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 155; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 156; GFX9-NEXT: s_waitcnt lgkmcnt(0) 157; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 158; GFX9-NEXT: s_waitcnt vmcnt(0) 159; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 160; GFX9-NEXT: s_waitcnt vmcnt(0) 161; GFX9-NEXT: ds_write_b32 v0, v1 162; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 163; GFX9-NEXT: s_endpgm 164 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 165 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 166 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 167 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4 168 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4 169 %arrayidx0 = getelementptr inbounds 
[512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 170 store float %val0, float addrspace(3)* %arrayidx0, align 4 171 %add.x = add nsw i32 %x.i, 8 172 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 173 store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 174 ret void 175} 176 177; Two data subregisters from different super registers. 178; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo 179; This should be an s_mov_b32. The v_mov_b32 gets introduced by an 180; early legalization of the constant bus constraint on the v_lshl_add_u32, 181; and then SIFoldOperands folds in an unlucky order. NOTE(review): the GFX9 checks below show neither this v_mov_b32 nor a v_lshl_add_u32, so this TODO may be stale; confirm. 182define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { 183; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: 184; CI: ; %bb.0: 185; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 186; CI-NEXT: s_mov_b32 s3, 0xf000 187; CI-NEXT: s_mov_b32 s2, 0 188; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 189; CI-NEXT: v_mov_b32_e32 v2, 0 190; CI-NEXT: s_waitcnt lgkmcnt(0) 191; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc 192; CI-NEXT: s_waitcnt vmcnt(0) 193; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc 194; CI-NEXT: s_waitcnt vmcnt(0) 195; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 196; CI-NEXT: s_mov_b32 m0, -1 197; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8 198; CI-NEXT: s_endpgm 199; 200; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: 201; GFX9: ; %bb.0: 202; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 203; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 204; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 205; GFX9-NEXT: ; kill: killed $vgpr4 206; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 207; GFX9-NEXT: s_waitcnt lgkmcnt(0) 208; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc 209; GFX9-NEXT: s_waitcnt vmcnt(0) 210; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc 211; GFX9-NEXT: s_waitcnt 
vmcnt(0) 212; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 213; GFX9-NEXT: s_endpgm 214 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 215 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i 216 %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 217 %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 218 %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 219 %val0.0 = extractelement <2 x float> %val0, i32 0 220 %val1.1 = extractelement <2 x float> %val1, i32 1 221 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 222 store float %val0.0, float addrspace(3)* %arrayidx0, align 4 223 %add.x = add nsw i32 %x.i, 8 224 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 225 store float %val1.1, float addrspace(3)* %arrayidx1, align 4 226 ret void 227} 228 229define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { 230; CI-LABEL: simple_write2_two_val_subreg2_f32: 231; CI: ; %bb.0: 232; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 233; CI-NEXT: s_mov_b32 s3, 0xf000 234; CI-NEXT: s_mov_b32 s2, 0 235; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 236; CI-NEXT: v_mov_b32_e32 v2, 0 237; CI-NEXT: s_waitcnt lgkmcnt(0) 238; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 239; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 240; CI-NEXT: s_mov_b32 m0, -1 241; CI-NEXT: s_waitcnt vmcnt(0) 242; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 243; CI-NEXT: s_endpgm 244; 245; GFX9-LABEL: simple_write2_two_val_subreg2_f32: 246; GFX9: ; %bb.0: 247; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 248; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 249; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 251; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] 252; GFX9-NEXT: s_waitcnt vmcnt(0) 253; 
GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 254; GFX9-NEXT: s_endpgm 255 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 256 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i 257 %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 258 %val0 = extractelement <2 x float> %val, i32 0 259 %val1 = extractelement <2 x float> %val, i32 1 260 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 261 store float %val0, float addrspace(3)* %arrayidx0, align 4 262 %add.x = add nsw i32 %x.i, 8 263 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 264 store float %val1, float addrspace(3)* %arrayidx1, align 4 265 ret void 266} 267 268define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { 269; CI-LABEL: simple_write2_two_val_subreg4_f32: 270; CI: ; %bb.0: 271; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 272; CI-NEXT: s_mov_b32 s3, 0xf000 273; CI-NEXT: s_mov_b32 s2, 0 274; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 275; CI-NEXT: v_mov_b32_e32 v2, 0 276; CI-NEXT: s_waitcnt lgkmcnt(0) 277; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64 278; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 279; CI-NEXT: s_mov_b32 m0, -1 280; CI-NEXT: s_waitcnt vmcnt(0) 281; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 282; CI-NEXT: s_endpgm 283; 284; GFX9-LABEL: simple_write2_two_val_subreg4_f32: 285; GFX9: ; %bb.0: 286; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 287; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 288; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 289; GFX9-NEXT: s_waitcnt lgkmcnt(0) 290; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1] 291; GFX9-NEXT: s_waitcnt vmcnt(0) 292; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 293; GFX9-NEXT: s_endpgm 294 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 295 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i 296 
%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 297 %val0 = extractelement <4 x float> %val, i32 0 298 %val1 = extractelement <4 x float> %val, i32 3 299 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 300 store float %val0, float addrspace(3)* %arrayidx0, align 4 301 %add.x = add nsw i32 %x.i, 8 302 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 303 store float %val1, float addrspace(3)* %arrayidx1, align 4 304 ret void 305} 306 307define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 308; CI-LABEL: simple_write2_two_val_max_offset_f32: 309; CI: ; %bb.0: 310; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 311; CI-NEXT: s_mov_b32 s3, 0xf000 312; CI-NEXT: s_mov_b32 s2, 0 313; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 314; CI-NEXT: v_mov_b32_e32 v1, 0 315; CI-NEXT: s_waitcnt lgkmcnt(0) 316; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 317; CI-NEXT: s_waitcnt vmcnt(0) 318; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc 319; CI-NEXT: s_waitcnt vmcnt(0) 320; CI-NEXT: s_mov_b32 m0, -1 321; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255 322; CI-NEXT: s_endpgm 323; 324; GFX9-LABEL: simple_write2_two_val_max_offset_f32: 325; GFX9: ; %bb.0: 326; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 327; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 328; GFX9-NEXT: s_waitcnt lgkmcnt(0) 329; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 330; GFX9-NEXT: s_waitcnt vmcnt(0) 331; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc 332; GFX9-NEXT: s_waitcnt vmcnt(0) 333; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255 334; GFX9-NEXT: s_endpgm 335 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 336 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i 337 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 338 %val0 = load volatile 
float, float addrspace(1)* %in.gep.0, align 4 339 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4 340 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 341 store float %val0, float addrspace(3)* %arrayidx0, align 4 342 %add.x = add nsw i32 %x.i, 255 343 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 344 store float %val1, float addrspace(3)* %arrayidx1, align 4 345 ret void 346} 347 348define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 349; CI-LABEL: simple_write2_two_val_too_far_f32: 350; CI: ; %bb.0: 351; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 352; CI-NEXT: s_mov_b32 s7, 0xf000 353; CI-NEXT: s_mov_b32 s6, 0 354; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 355; CI-NEXT: v_mov_b32_e32 v1, 0 356; CI-NEXT: s_waitcnt lgkmcnt(0) 357; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 358; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 359; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 360; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 361; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 362; CI-NEXT: s_mov_b32 m0, -1 363; CI-NEXT: s_waitcnt vmcnt(1) 364; CI-NEXT: ds_write_b32 v0, v2 365; CI-NEXT: s_waitcnt vmcnt(0) 366; CI-NEXT: ds_write_b32 v0, v1 offset:1028 367; CI-NEXT: s_endpgm 368; 369; GFX9-LABEL: simple_write2_two_val_too_far_f32: 370; GFX9: ; %bb.0: 371; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 372; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 373; GFX9-NEXT: s_waitcnt lgkmcnt(0) 374; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 375; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 376; GFX9-NEXT: s_waitcnt vmcnt(1) 377; GFX9-NEXT: ds_write_b32 v0, v1 378; GFX9-NEXT: s_waitcnt vmcnt(0) 379; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028 380; GFX9-NEXT: s_endpgm 381 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 382 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 
%x.i 383 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 384 %val0 = load float, float addrspace(1)* %in0.gep, align 4 385 %val1 = load float, float addrspace(1)* %in1.gep, align 4 386 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 387 store float %val0, float addrspace(3)* %arrayidx0, align 4 388 %add.x = add nsw i32 %x.i, 257 389 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 390 store float %val1, float addrspace(3)* %arrayidx1, align 4 391 ret void 392} 393 394define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 395; CI-LABEL: simple_write2_two_val_f32_x2: 396; CI: ; %bb.0: 397; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 398; CI-NEXT: s_mov_b32 s7, 0xf000 399; CI-NEXT: s_mov_b32 s6, 0 400; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 401; CI-NEXT: v_mov_b32_e32 v1, 0 402; CI-NEXT: s_waitcnt lgkmcnt(0) 403; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 404; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 405; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 406; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 407; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 408; CI-NEXT: s_mov_b32 m0, -1 409; CI-NEXT: s_waitcnt vmcnt(0) 410; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 411; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27 412; CI-NEXT: s_endpgm 413; 414; GFX9-LABEL: simple_write2_two_val_f32_x2: 415; GFX9: ; %bb.0: 416; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 417; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 419; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 420; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 421; GFX9-NEXT: s_waitcnt vmcnt(0) 422; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 423; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 424; GFX9-NEXT: s_endpgm 425 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 426 
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x 427 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x 428 %val0 = load float, float addrspace(1)* %in0.gep, align 4 429 %val1 = load float, float addrspace(1)* %in1.gep, align 4 430 431 %idx.0 = add nsw i32 %tid.x, 0 432 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 433 store float %val0, float addrspace(3)* %arrayidx0, align 4 434 435 %idx.1 = add nsw i32 %tid.x, 8 436 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 437 store float %val1, float addrspace(3)* %arrayidx1, align 4 438 439 %idx.2 = add nsw i32 %tid.x, 11 440 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 441 store float %val0, float addrspace(3)* %arrayidx2, align 4 442 443 %idx.3 = add nsw i32 %tid.x, 27 444 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 445 store float %val1, float addrspace(3)* %arrayidx3, align 4 446 447 ret void 448} 449 450define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 451; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: 452; CI: ; %bb.0: 453; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 454; CI-NEXT: s_mov_b32 s7, 0xf000 455; CI-NEXT: s_mov_b32 s6, 0 456; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 457; CI-NEXT: v_mov_b32_e32 v1, 0 458; CI-NEXT: s_waitcnt lgkmcnt(0) 459; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 460; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 461; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 462; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 463; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 464; CI-NEXT: s_mov_b32 m0, -1 465; CI-NEXT: s_waitcnt vmcnt(0) 466; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8 467; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 
offset1:27 468; CI-NEXT: s_endpgm 469; 470; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: 471; GFX9: ; %bb.0: 472; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 473; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 474; GFX9-NEXT: s_waitcnt lgkmcnt(0) 475; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 476; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 477; GFX9-NEXT: s_waitcnt vmcnt(0) 478; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8 479; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 480; GFX9-NEXT: s_endpgm 481 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 482 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x 483 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x 484 %val0 = load float, float addrspace(1)* %in0.gep, align 4 485 %val1 = load float, float addrspace(1)* %in1.gep, align 4 486 487 %idx.0 = add nsw i32 %tid.x, 3 488 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 489 store float %val0, float addrspace(3)* %arrayidx0, align 4 490 491 %idx.1 = add nsw i32 %tid.x, 8 492 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 493 store float %val1, float addrspace(3)* %arrayidx1, align 4 494 495 %idx.2 = add nsw i32 %tid.x, 11 496 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 497 store float %val0, float addrspace(3)* %arrayidx2, align 4 498 499 %idx.3 = add nsw i32 %tid.x, 27 500 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 501 store float %val1, float addrspace(3)* %arrayidx3, align 4 502 503 ret void 504} 505 506define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { 507; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: 508; CI: ; %bb.0: 509; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x2 510; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x6 511; CI-NEXT: s_mov_b32 s3, 0xf000 512; CI-NEXT: s_mov_b32 s2, 0 513; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 514; CI-NEXT: s_waitcnt lgkmcnt(0) 515; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 516; CI-NEXT: v_mov_b32_e32 v1, 0 517; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 518; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 519; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 520; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 521; CI-NEXT: v_mov_b32_e32 v1, s8 522; CI-NEXT: s_mov_b32 m0, -1 523; CI-NEXT: v_mov_b32_e32 v3, s9 524; CI-NEXT: s_waitcnt vmcnt(1) 525; CI-NEXT: ds_write_b32 v1, v2 offset:32 526; CI-NEXT: s_waitcnt vmcnt(0) 527; CI-NEXT: ds_write_b32 v3, v0 offset:32 528; CI-NEXT: s_endpgm 529; 530; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: 531; GFX9: ; %bb.0: 532; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 533; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 534; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 535; GFX9-NEXT: s_waitcnt lgkmcnt(0) 536; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 537; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 538; GFX9-NEXT: v_mov_b32_e32 v0, s2 539; GFX9-NEXT: v_mov_b32_e32 v3, s3 540; GFX9-NEXT: s_waitcnt vmcnt(1) 541; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 542; GFX9-NEXT: s_waitcnt vmcnt(0) 543; GFX9-NEXT: ds_write_b32 v3, v2 offset:32 544; GFX9-NEXT: s_endpgm 545 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 546 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 547 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 548 %val0 = load float, float addrspace(1)* %in0.gep, align 4 549 %val1 = load float, float addrspace(1)* %in1.gep, align 4 550 551 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 552 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 553 %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 554 %gep.0 = extractelement <2 x float addrspace(3)*> 
%gep, i32 0 555 %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 556 557 ; Apply an additional offset after the vector that will be more obviously folded. 558 %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 559 store float %val0, float addrspace(3)* %gep.0, align 4 560 561 %add.x = add nsw i32 %x.i, 8 562 store float %val1, float addrspace(3)* %gep.1.offset, align 4 563 ret void 564} 565 566define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { 567; CI-LABEL: simple_write2_one_val_f64: 568; CI: ; %bb.0: 569; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 570; CI-NEXT: s_mov_b32 s3, 0xf000 571; CI-NEXT: s_mov_b32 s2, 0 572; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 573; CI-NEXT: v_mov_b32_e32 v1, 0 574; CI-NEXT: s_waitcnt lgkmcnt(0) 575; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 576; CI-NEXT: s_mov_b32 m0, -1 577; CI-NEXT: s_waitcnt vmcnt(0) 578; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8 579; CI-NEXT: s_endpgm 580; 581; GFX9-LABEL: simple_write2_one_val_f64: 582; GFX9: ; %bb.0: 583; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 584; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 585; GFX9-NEXT: s_waitcnt lgkmcnt(0) 586; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 587; GFX9-NEXT: s_waitcnt vmcnt(0) 588; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8 589; GFX9-NEXT: s_endpgm 590 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 591 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 592 %val = load double, double addrspace(1)* %in.gep, align 8 593 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 594 store double %val, double addrspace(3)* %arrayidx0, align 8 595 %add.x = add nsw i32 %x.i, 8 596 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 597 store double %val, double addrspace(3)* %arrayidx1, align 8 598 ret void 
599} 600 601define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { 602; CI-LABEL: misaligned_simple_write2_one_val_f64: 603; CI: ; %bb.0: 604; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 605; CI-NEXT: s_load_dword s0, s[0:1], 0x4 606; CI-NEXT: s_mov_b32 s7, 0xf000 607; CI-NEXT: s_mov_b32 s6, 0 608; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 609; CI-NEXT: v_mov_b32_e32 v1, 0 610; CI-NEXT: s_waitcnt lgkmcnt(0) 611; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 612; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 613; CI-NEXT: s_mov_b32 m0, -1 614; CI-NEXT: s_waitcnt vmcnt(0) 615; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 616; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15 617; CI-NEXT: s_endpgm 618; 619; GFX9-LABEL: misaligned_simple_write2_one_val_f64: 620; GFX9: ; %bb.0: 621; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 622; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 623; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 624; GFX9-NEXT: s_waitcnt lgkmcnt(0) 625; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 626; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 627; GFX9-NEXT: s_waitcnt vmcnt(0) 628; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 629; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15 630; GFX9-NEXT: s_endpgm 631 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 632 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 633 %val = load double, double addrspace(1)* %in.gep, align 8 634 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 635 store double %val, double addrspace(3)* %arrayidx0, align 4 636 %add.x = add nsw i32 %x.i, 7 637 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x 638 store double %val, double addrspace(3)* %arrayidx1, align 4 639 ret void 640} 641 642define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* 
%in, double addrspace(3)* %lds) #0 { 643; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: 644; CI: ; %bb.0: 645; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 646; CI-NEXT: s_load_dword s0, s[0:1], 0x4 647; CI-NEXT: s_mov_b32 s7, 0xf000 648; CI-NEXT: s_mov_b32 s6, 0 649; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 650; CI-NEXT: v_mov_b32_e32 v1, 0 651; CI-NEXT: s_waitcnt lgkmcnt(0) 652; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 653; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 654; CI-NEXT: s_mov_b32 m0, -1 655; CI-NEXT: s_waitcnt vmcnt(0) 656; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 657; CI-NEXT: ds_write_b8 v0, v1 offset:5 658; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 659; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 660; CI-NEXT: ds_write_b8 v0, v2 offset:13 661; CI-NEXT: ds_write_b8 v0, v1 offset:9 662; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 663; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 664; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 665; CI-NEXT: ds_write_b8 v0, v3 offset:8 666; CI-NEXT: ds_write_b8 v0, v4 offset:7 667; CI-NEXT: ds_write_b8 v0, v5 offset:6 668; CI-NEXT: ds_write_b8 v0, v1 offset:16 669; CI-NEXT: ds_write_b8 v0, v6 offset:15 670; CI-NEXT: ds_write_b8 v0, v2 offset:14 671; CI-NEXT: ds_write_b8 v0, v3 offset:12 672; CI-NEXT: ds_write_b8 v0, v4 offset:11 673; CI-NEXT: ds_write_b8 v0, v5 offset:10 674; CI-NEXT: s_endpgm 675; 676; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 677; GFX9-ALIGNED: ; %bb.0: 678; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 679; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 680; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 681; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 682; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 683; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 684; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) 685; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 686; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 687; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 688; 
GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 689; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 690; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 691; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 692; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 693; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 694; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 695; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8 696; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 697; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 698; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 699; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 700; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 701; GFX9-ALIGNED-NEXT: s_endpgm 702; 703; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 704; GFX9-UNALIGNED: ; %bb.0: 705; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 706; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 707; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 708; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 709; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 710; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 711; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) 712; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5 713; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9 714; GFX9-UNALIGNED-NEXT: s_endpgm 715 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 716 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 717 %val = load double, double addrspace(1)* %in.gep, align 8 718 %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 719 %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)* 720 %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5 721 %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)* 722 store double %val, double addrspace(3)* %addr0, align 1 723 %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9 724 %addr1 = 
bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
  store double %val, double addrspace(3)* %addr1, align 1
  ret void
}

; Two distinct f64 values (volatile loads of %in[x] and %in[x + 1]) are stored
; to @lds.f64[x] and @lds.f64[x + 8] with align 8. Both targets are expected to
; merge the pair into one ds_write2_b64 with offset1:8.
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}

@foo = addrspace(3) global [4 x i32] undef, align 4

; Two i32 stores of the constant 123 to the adjacent elements @foo[0] and
; @foo[1]. The expected output is a single ds_write_b64.
define amdgpu_kernel void @store_constant_adjacent_offsets() {
; CI-LABEL: store_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
  ret void
}

; The same constant is stored to the non-adjacent elements @foo[0] and @foo[2].
; The expected output is one ds_write2_b32 with offset1:2.
define amdgpu_kernel void @store_constant_disjoint_offsets() {
; CI-LABEL: store_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; GFX9-NEXT: s_endpgm
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
  ret void
}

@bar = addrspace(3) global [4 x i64] undef, align 4

; Two i64 stores of 123 to @bar[0] and @bar[1], each marked align 4 (less than
; the 8-byte store size). The expected output is a single ds_write_b128.
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, v0
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b128 v1, v[0:3]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-NEXT: s_endpgm
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
  ret void
}

@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; i64 stores of 123 to @bar.large[2048] and @bar.large[4095], i.e. byte offsets
; 16384 and 32760, each marked align 4. The expected output keeps two separate
; ds_write_b64 instructions rather than a merged write.
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; GFX9-NEXT: s_endpgm
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
  ret void
}

@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4

; One float loaded from %in is stored to @sgemm.lA at indices
; workgroup.id.x + {0, 1, 16, 17} and to @sgemm.lB at indices
; workitem.id.y + {0, 1, 32, 33, 64, 65}. The expected output pairs these ten
; stores into five ds_write2_b32 instructions.
define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
; CI-LABEL: write2_sgemm_sequence:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_lshl_b32 s1, s2, 2
; CI-NEXT: s_add_i32 s2, s1, 0xc20
; CI-NEXT: s_addk_i32 s1, 0xc60
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_add_i32 s1, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
  %val = load float, float addrspace(1)* %in
  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx44, align 4
  %add47 = add nsw i32 %x.i, 1
  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
  store float %val, float addrspace(3)* %arrayidx48, align 4
  %add51 = add nsw i32 %x.i, 16
  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
  store float %val, float addrspace(3)* %arrayidx52, align 4
  %add55 = add nsw i32 %x.i, 17
  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
  store float %val, float addrspace(3)* %arrayidx56, align 4
  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
  store float %val, float addrspace(3)* %arrayidx60, align 4
  %add63 = add nsw i32 %y.i, 1
  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
  store float %val, float addrspace(3)* %arrayidx64, align 4
  %add67 = add nsw i32 %y.i, 32
  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
  store float %val, float addrspace(3)* %arrayidx68, align 4
  %add71 = add nsw i32 %y.i, 33
  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
  store float %val, float addrspace(3)* %arrayidx72, align 4
  %add75 = add nsw i32 %y.i, 64
  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
  store float %val, float addrspace(3)* %arrayidx76, align 4
  %add79 = add nsw i32 %y.i, 65
  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
  store float %val, float addrspace(3)* %arrayidx80, align 4
  ret void
}

; A <4 x float> value loaded from %in is stored with align 4 to %out[x]. The
; expected output splits it into two ds_write2_b32 instructions (elements 0-1
; and elements 2-3). In the +unaligned-access-mode run the two instructions are
; expected in the opposite order.
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_v4f32_superreg_align4:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
; CI-NEXT: s_load_dword s4, s[0:1], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
  ret void
}

@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1

; The constant <2 x i32> <123, 456> is stored with align 1 at byte offset 65 of
; @v2i32_align1. The bonaire run and the gfx900 run with -unaligned-access-mode
; are expected to emit eight ds_write_b8 instructions; the gfx900 run with
; +unaligned-access-mode is expected to emit a single ds_write_b64 at offset 65.
define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-LABEL: write2_v2i32_align1_odd_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
  ret void
}

declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { convergent nounwind }