1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s 3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s 4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s 5 6@lds = addrspace(3) global [512 x float] undef, align 4 7@lds.f64 = addrspace(3) global [512 x double] undef, align 8 8 9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 10; CI-LABEL: simple_write2_one_val_f32: 11; CI: ; %bb.0: 12; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 13; CI-NEXT: s_mov_b32 s3, 0xf000 14; CI-NEXT: s_mov_b32 s2, 0 15; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 16; CI-NEXT: v_mov_b32_e32 v1, 0 17; CI-NEXT: s_waitcnt lgkmcnt(0) 18; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 19; CI-NEXT: s_mov_b32 m0, -1 20; CI-NEXT: s_waitcnt vmcnt(0) 21; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 22; CI-NEXT: s_endpgm 23; 24; GFX9-LABEL: simple_write2_one_val_f32: 25; GFX9: ; %bb.0: 26; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 27; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 28; GFX9-NEXT: s_waitcnt lgkmcnt(0) 29; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 30; GFX9-NEXT: s_waitcnt vmcnt(0) 31; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 32; GFX9-NEXT: s_endpgm 33 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 34 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i 35 %val = load float, float addrspace(1)* %in.gep, align 4 36 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 37 store float %val, 
float addrspace(3)* %arrayidx0, align 4 38 %add.x = add nsw i32 %x.i, 8 39 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 40 store float %val, float addrspace(3)* %arrayidx1, align 4 41 ret void 42} 43 44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 45; CI-LABEL: simple_write2_two_val_f32: 46; CI: ; %bb.0: 47; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 48; CI-NEXT: s_mov_b32 s3, 0xf000 49; CI-NEXT: s_mov_b32 s2, 0 50; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 51; CI-NEXT: v_mov_b32_e32 v1, 0 52; CI-NEXT: s_waitcnt lgkmcnt(0) 53; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 54; CI-NEXT: s_waitcnt vmcnt(0) 55; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc 56; CI-NEXT: s_waitcnt vmcnt(0) 57; CI-NEXT: s_mov_b32 m0, -1 58; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 59; CI-NEXT: s_endpgm 60; 61; GFX9-LABEL: simple_write2_two_val_f32: 62; GFX9: ; %bb.0: 63; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 64; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 65; GFX9-NEXT: s_waitcnt lgkmcnt(0) 66; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 67; GFX9-NEXT: s_waitcnt vmcnt(0) 68; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc 69; GFX9-NEXT: s_waitcnt vmcnt(0) 70; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 71; GFX9-NEXT: s_endpgm 72 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 73 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i 74 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 75 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4 76 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4 77 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 78 store float %val0, float addrspace(3)* %arrayidx0, align 4 79 %add.x = add nsw i32 %x.i, 8 80 %arrayidx1 = getelementptr inbounds [512 x float], [512 x 
float] addrspace(3)* @lds, i32 0, i32 %add.x 81 store float %val1, float addrspace(3)* %arrayidx1, align 4 82 ret void 83} 84 85define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 86; CI-LABEL: simple_write2_two_val_f32_volatile_0: 87; CI: ; %bb.0: 88; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 89; CI-NEXT: s_mov_b32 s7, 0xf000 90; CI-NEXT: s_mov_b32 s6, 0 91; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 92; CI-NEXT: v_mov_b32_e32 v1, 0 93; CI-NEXT: s_waitcnt lgkmcnt(0) 94; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 95; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 96; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 97; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 98; CI-NEXT: s_waitcnt vmcnt(0) 99; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc 100; CI-NEXT: s_waitcnt vmcnt(0) 101; CI-NEXT: s_mov_b32 m0, -1 102; CI-NEXT: ds_write_b32 v0, v2 103; CI-NEXT: ds_write_b32 v0, v1 offset:32 104; CI-NEXT: s_endpgm 105; 106; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: 107; GFX9: ; %bb.0: 108; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 109; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 111; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 112; GFX9-NEXT: s_waitcnt vmcnt(0) 113; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 114; GFX9-NEXT: s_waitcnt vmcnt(0) 115; GFX9-NEXT: ds_write_b32 v0, v1 116; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 117; GFX9-NEXT: s_endpgm 118 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 119 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 120 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 121 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4 122 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4 123 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 124 store volatile float %val0, float addrspace(3)* 
%arrayidx0, align 4 125 %add.x = add nsw i32 %x.i, 8 126 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 127 store float %val1, float addrspace(3)* %arrayidx1, align 4 128 ret void 129} 130 131define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 132; CI-LABEL: simple_write2_two_val_f32_volatile_1: 133; CI: ; %bb.0: 134; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 135; CI-NEXT: s_mov_b32 s7, 0xf000 136; CI-NEXT: s_mov_b32 s6, 0 137; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 138; CI-NEXT: v_mov_b32_e32 v1, 0 139; CI-NEXT: s_waitcnt lgkmcnt(0) 140; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 141; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 142; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 143; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 144; CI-NEXT: s_waitcnt vmcnt(0) 145; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc 146; CI-NEXT: s_waitcnt vmcnt(0) 147; CI-NEXT: s_mov_b32 m0, -1 148; CI-NEXT: ds_write_b32 v0, v2 149; CI-NEXT: ds_write_b32 v0, v1 offset:32 150; CI-NEXT: s_endpgm 151; 152; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 155; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 156; GFX9-NEXT: s_waitcnt lgkmcnt(0) 157; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 158; GFX9-NEXT: s_waitcnt vmcnt(0) 159; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 160; GFX9-NEXT: s_waitcnt vmcnt(0) 161; GFX9-NEXT: ds_write_b32 v0, v1 162; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 163; GFX9-NEXT: s_endpgm 164 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 165 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 166 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 167 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4 168 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4 169 %arrayidx0 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 170 store float %val0, float addrspace(3)* %arrayidx0, align 4 171 %add.x = add nsw i32 %x.i, 8 172 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 173 store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 174 ret void 175} 176 177; 2 data subregisters from different super registers. 178; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo 179; This should be an s_mov_b32. The v_mov_b32 gets introduced by an 180; early legalization of the constant bus constraint on the v_lshl_add_u32, 181; and then SIFoldOperands folds in an unlucky order. 182define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { 183; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: 184; CI: ; %bb.0: 185; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 186; CI-NEXT: s_mov_b32 s3, 0xf000 187; CI-NEXT: s_mov_b32 s2, 0 188; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 189; CI-NEXT: v_mov_b32_e32 v2, 0 190; CI-NEXT: s_waitcnt lgkmcnt(0) 191; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc 192; CI-NEXT: s_waitcnt vmcnt(0) 193; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc 194; CI-NEXT: s_waitcnt vmcnt(0) 195; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 196; CI-NEXT: s_mov_b32 m0, -1 197; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8 198; CI-NEXT: s_endpgm 199; 200; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: 201; GFX9: ; %bb.0: 202; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 203; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 204; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 205; GFX9-NEXT: ; kill: killed $vgpr4 206; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 207; GFX9-NEXT: s_waitcnt lgkmcnt(0) 208; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc 209; GFX9-NEXT: s_waitcnt vmcnt(0) 210; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc 211; 
GFX9-NEXT: s_waitcnt vmcnt(0) 212; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 213; GFX9-NEXT: s_endpgm 214 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 215 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i 216 %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 217 %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 218 %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 219 %val0.0 = extractelement <2 x float> %val0, i32 0 220 %val1.1 = extractelement <2 x float> %val1, i32 1 221 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 222 store float %val0.0, float addrspace(3)* %arrayidx0, align 4 223 %add.x = add nsw i32 %x.i, 8 224 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 225 store float %val1.1, float addrspace(3)* %arrayidx1, align 4 226 ret void 227} 228 229define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { 230; CI-LABEL: simple_write2_two_val_subreg2_f32: 231; CI: ; %bb.0: 232; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 233; CI-NEXT: s_mov_b32 s3, 0xf000 234; CI-NEXT: s_mov_b32 s2, 0 235; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 236; CI-NEXT: v_mov_b32_e32 v2, 0 237; CI-NEXT: s_waitcnt lgkmcnt(0) 238; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 239; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 240; CI-NEXT: s_mov_b32 m0, -1 241; CI-NEXT: s_waitcnt vmcnt(0) 242; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 243; CI-NEXT: s_endpgm 244; 245; GFX9-LABEL: simple_write2_two_val_subreg2_f32: 246; GFX9: ; %bb.0: 247; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 248; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 249; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 251; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] 252; GFX9-NEXT: 
s_waitcnt vmcnt(0) 253; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 254; GFX9-NEXT: s_endpgm 255 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 256 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i 257 %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 258 %val0 = extractelement <2 x float> %val, i32 0 259 %val1 = extractelement <2 x float> %val, i32 1 260 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 261 store float %val0, float addrspace(3)* %arrayidx0, align 4 262 %add.x = add nsw i32 %x.i, 8 263 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 264 store float %val1, float addrspace(3)* %arrayidx1, align 4 265 ret void 266} 267 268define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { 269; CI-LABEL: simple_write2_two_val_subreg4_f32: 270; CI: ; %bb.0: 271; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 272; CI-NEXT: s_mov_b32 s3, 0xf000 273; CI-NEXT: s_mov_b32 s2, 0 274; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 275; CI-NEXT: v_mov_b32_e32 v2, 0 276; CI-NEXT: s_waitcnt lgkmcnt(0) 277; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64 278; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 279; CI-NEXT: s_mov_b32 m0, -1 280; CI-NEXT: s_waitcnt vmcnt(0) 281; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 282; CI-NEXT: s_endpgm 283; 284; GFX9-LABEL: simple_write2_two_val_subreg4_f32: 285; GFX9: ; %bb.0: 286; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 287; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 288; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 289; GFX9-NEXT: s_waitcnt lgkmcnt(0) 290; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1] 291; GFX9-NEXT: s_waitcnt vmcnt(0) 292; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 293; GFX9-NEXT: s_endpgm 294 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 295 %in.gep = getelementptr <4 x float>, <4 x float> 
addrspace(1)* %in, i32 %x.i 296 %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 297 %val0 = extractelement <4 x float> %val, i32 0 298 %val1 = extractelement <4 x float> %val, i32 3 299 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 300 store float %val0, float addrspace(3)* %arrayidx0, align 4 301 %add.x = add nsw i32 %x.i, 8 302 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 303 store float %val1, float addrspace(3)* %arrayidx1, align 4 304 ret void 305} 306 307define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { 308; CI-LABEL: simple_write2_two_val_max_offset_f32: 309; CI: ; %bb.0: 310; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 311; CI-NEXT: s_mov_b32 s3, 0xf000 312; CI-NEXT: s_mov_b32 s2, 0 313; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 314; CI-NEXT: v_mov_b32_e32 v1, 0 315; CI-NEXT: s_waitcnt lgkmcnt(0) 316; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc 317; CI-NEXT: s_waitcnt vmcnt(0) 318; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc 319; CI-NEXT: s_waitcnt vmcnt(0) 320; CI-NEXT: s_mov_b32 m0, -1 321; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255 322; CI-NEXT: s_endpgm 323; 324; GFX9-LABEL: simple_write2_two_val_max_offset_f32: 325; GFX9: ; %bb.0: 326; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 327; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 328; GFX9-NEXT: s_waitcnt lgkmcnt(0) 329; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc 330; GFX9-NEXT: s_waitcnt vmcnt(0) 331; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc 332; GFX9-NEXT: s_waitcnt vmcnt(0) 333; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255 334; GFX9-NEXT: s_endpgm 335 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 336 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i 337 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 
1 338 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4 339 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4 340 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 341 store float %val0, float addrspace(3)* %arrayidx0, align 4 342 %add.x = add nsw i32 %x.i, 255 343 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 344 store float %val1, float addrspace(3)* %arrayidx1, align 4 345 ret void 346} 347 348define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 349; CI-LABEL: simple_write2_two_val_too_far_f32: 350; CI: ; %bb.0: 351; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 352; CI-NEXT: s_mov_b32 s7, 0xf000 353; CI-NEXT: s_mov_b32 s6, 0 354; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 355; CI-NEXT: v_mov_b32_e32 v1, 0 356; CI-NEXT: s_waitcnt lgkmcnt(0) 357; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 358; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 359; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 360; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 361; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 362; CI-NEXT: s_mov_b32 m0, -1 363; CI-NEXT: s_waitcnt vmcnt(1) 364; CI-NEXT: ds_write_b32 v0, v2 365; CI-NEXT: s_waitcnt vmcnt(0) 366; CI-NEXT: ds_write_b32 v0, v1 offset:1028 367; CI-NEXT: s_endpgm 368; 369; GFX9-LABEL: simple_write2_two_val_too_far_f32: 370; GFX9: ; %bb.0: 371; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 372; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 373; GFX9-NEXT: s_waitcnt lgkmcnt(0) 374; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 375; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 376; GFX9-NEXT: s_waitcnt vmcnt(1) 377; GFX9-NEXT: ds_write_b32 v0, v1 378; GFX9-NEXT: s_waitcnt vmcnt(0) 379; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028 380; GFX9-NEXT: s_endpgm 381 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 382 %in0.gep = getelementptr float, 
float addrspace(1)* %in0, i32 %x.i 383 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 384 %val0 = load float, float addrspace(1)* %in0.gep, align 4 385 %val1 = load float, float addrspace(1)* %in1.gep, align 4 386 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i 387 store float %val0, float addrspace(3)* %arrayidx0, align 4 388 %add.x = add nsw i32 %x.i, 257 389 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x 390 store float %val1, float addrspace(3)* %arrayidx1, align 4 391 ret void 392} 393 394define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 395; CI-LABEL: simple_write2_two_val_f32_x2: 396; CI: ; %bb.0: 397; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 398; CI-NEXT: s_mov_b32 s7, 0xf000 399; CI-NEXT: s_mov_b32 s6, 0 400; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 401; CI-NEXT: v_mov_b32_e32 v1, 0 402; CI-NEXT: s_waitcnt lgkmcnt(0) 403; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 404; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 405; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 406; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 407; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 408; CI-NEXT: s_mov_b32 m0, -1 409; CI-NEXT: s_waitcnt vmcnt(0) 410; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 411; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27 412; CI-NEXT: s_endpgm 413; 414; GFX9-LABEL: simple_write2_two_val_f32_x2: 415; GFX9: ; %bb.0: 416; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 417; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 419; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 420; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 421; GFX9-NEXT: s_waitcnt vmcnt(0) 422; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 423; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 424; GFX9-NEXT: s_endpgm 425 %tid.x = tail call i32 
@llvm.amdgcn.workitem.id.x() #1 426 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x 427 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x 428 %val0 = load float, float addrspace(1)* %in0.gep, align 4 429 %val1 = load float, float addrspace(1)* %in1.gep, align 4 430 431 %idx.0 = add nsw i32 %tid.x, 0 432 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 433 store float %val0, float addrspace(3)* %arrayidx0, align 4 434 435 %idx.1 = add nsw i32 %tid.x, 8 436 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 437 store float %val1, float addrspace(3)* %arrayidx1, align 4 438 439 %idx.2 = add nsw i32 %tid.x, 11 440 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 441 store float %val0, float addrspace(3)* %arrayidx2, align 4 442 443 %idx.3 = add nsw i32 %tid.x, 27 444 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 445 store float %val1, float addrspace(3)* %arrayidx3, align 4 446 447 ret void 448} 449 450define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { 451; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: 452; CI: ; %bb.0: 453; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 454; CI-NEXT: s_mov_b32 s7, 0xf000 455; CI-NEXT: s_mov_b32 s6, 0 456; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 457; CI-NEXT: v_mov_b32_e32 v1, 0 458; CI-NEXT: s_waitcnt lgkmcnt(0) 459; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 460; CI-NEXT: s_mov_b64 s[0:1], s[2:3] 461; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 462; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 463; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 464; CI-NEXT: s_mov_b32 m0, -1 465; CI-NEXT: s_waitcnt vmcnt(0) 466; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8 467; CI-NEXT: 
ds_write2_b32 v0, v2, v1 offset0:11 offset1:27 468; CI-NEXT: s_endpgm 469; 470; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: 471; GFX9: ; %bb.0: 472; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 473; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 474; GFX9-NEXT: s_waitcnt lgkmcnt(0) 475; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 476; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 477; GFX9-NEXT: s_waitcnt vmcnt(0) 478; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8 479; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 480; GFX9-NEXT: s_endpgm 481 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 482 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x 483 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x 484 %val0 = load float, float addrspace(1)* %in0.gep, align 4 485 %val1 = load float, float addrspace(1)* %in1.gep, align 4 486 487 %idx.0 = add nsw i32 %tid.x, 3 488 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 489 store float %val0, float addrspace(3)* %arrayidx0, align 4 490 491 %idx.1 = add nsw i32 %tid.x, 8 492 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 493 store float %val1, float addrspace(3)* %arrayidx1, align 4 494 495 %idx.2 = add nsw i32 %tid.x, 11 496 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 497 store float %val0, float addrspace(3)* %arrayidx2, align 4 498 499 %idx.3 = add nsw i32 %tid.x, 27 500 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 501 store float %val1, float addrspace(3)* %arrayidx3, align 4 502 503 ret void 504} 505 506define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { 507; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: 508; 
CI: ; %bb.0: 509; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 510; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf 511; CI-NEXT: s_mov_b32 s3, 0xf000 512; CI-NEXT: s_mov_b32 s2, 0 513; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 514; CI-NEXT: s_waitcnt lgkmcnt(0) 515; CI-NEXT: s_mov_b64 s[0:1], s[4:5] 516; CI-NEXT: v_mov_b32_e32 v1, 0 517; CI-NEXT: s_mov_b64 s[4:5], s[6:7] 518; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 519; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 520; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 521; CI-NEXT: v_mov_b32_e32 v1, s8 522; CI-NEXT: s_mov_b32 m0, -1 523; CI-NEXT: v_mov_b32_e32 v3, s9 524; CI-NEXT: s_waitcnt vmcnt(1) 525; CI-NEXT: ds_write_b32 v1, v2 offset:32 526; CI-NEXT: s_waitcnt vmcnt(0) 527; CI-NEXT: ds_write_b32 v3, v0 offset:32 528; CI-NEXT: s_endpgm 529; 530; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: 531; GFX9: ; %bb.0: 532; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 533; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c 534; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 535; GFX9-NEXT: s_waitcnt lgkmcnt(0) 536; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 537; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 538; GFX9-NEXT: v_mov_b32_e32 v0, s2 539; GFX9-NEXT: v_mov_b32_e32 v3, s3 540; GFX9-NEXT: s_waitcnt vmcnt(1) 541; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 542; GFX9-NEXT: s_waitcnt vmcnt(0) 543; GFX9-NEXT: ds_write_b32 v3, v2 offset:32 544; GFX9-NEXT: s_endpgm 545 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 546 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i 547 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i 548 %val0 = load float, float addrspace(1)* %in0.gep, align 4 549 %val1 = load float, float addrspace(1)* %in1.gep, align 4 550 551 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 552 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 553 %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 554 %gep.0 = extractelement 
<2 x float addrspace(3)*> %gep, i32 0 555 %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 556 557 ; Apply an additional offset after the vector that will be more obviously folded. 558 %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 559 store float %val0, float addrspace(3)* %gep.0, align 4 560 561 %add.x = add nsw i32 %x.i, 8 562 store float %val1, float addrspace(3)* %gep.1.offset, align 4 563 ret void 564} 565 566define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { 567; CI-LABEL: simple_write2_one_val_f64: 568; CI: ; %bb.0: 569; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 570; CI-NEXT: s_mov_b32 s3, 0xf000 571; CI-NEXT: s_mov_b32 s2, 0 572; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 573; CI-NEXT: v_mov_b32_e32 v1, 0 574; CI-NEXT: s_waitcnt lgkmcnt(0) 575; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 576; CI-NEXT: s_mov_b32 m0, -1 577; CI-NEXT: s_waitcnt vmcnt(0) 578; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8 579; CI-NEXT: s_endpgm 580; 581; GFX9-LABEL: simple_write2_one_val_f64: 582; GFX9: ; %bb.0: 583; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 584; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 585; GFX9-NEXT: s_waitcnt lgkmcnt(0) 586; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 587; GFX9-NEXT: s_waitcnt vmcnt(0) 588; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8 589; GFX9-NEXT: s_endpgm 590 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 591 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 592 %val = load double, double addrspace(1)* %in.gep, align 8 593 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 594 store double %val, double addrspace(3)* %arrayidx0, align 8 595 %add.x = add nsw i32 %x.i, 8 596 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 597 store double %val, double addrspace(3)* 
%arrayidx1, align 8 598 ret void 599} 600 601define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { 602; CI-LABEL: misaligned_simple_write2_one_val_f64: 603; CI: ; %bb.0: 604; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 605; CI-NEXT: s_load_dword s0, s[0:1], 0xd 606; CI-NEXT: s_mov_b32 s7, 0xf000 607; CI-NEXT: s_mov_b32 s6, 0 608; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 609; CI-NEXT: v_mov_b32_e32 v1, 0 610; CI-NEXT: s_waitcnt lgkmcnt(0) 611; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 612; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 613; CI-NEXT: s_mov_b32 m0, -1 614; CI-NEXT: s_waitcnt vmcnt(0) 615; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 616; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15 617; CI-NEXT: s_endpgm 618; 619; GFX9-LABEL: misaligned_simple_write2_one_val_f64: 620; GFX9: ; %bb.0: 621; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 622; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 623; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 624; GFX9-NEXT: s_waitcnt lgkmcnt(0) 625; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 626; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 627; GFX9-NEXT: s_waitcnt vmcnt(0) 628; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 629; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15 630; GFX9-NEXT: s_endpgm 631 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 632 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 633 %val = load double, double addrspace(1)* %in.gep, align 8 634 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 635 store double %val, double addrspace(3)* %arrayidx0, align 4 636 %add.x = add nsw i32 %x.i, 7 637 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x 638 store double %val, double addrspace(3)* %arrayidx1, align 4 639 ret void 640} 641 642define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double 
addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { 643; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: 644; CI: ; %bb.0: 645; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 646; CI-NEXT: s_load_dword s0, s[0:1], 0xd 647; CI-NEXT: s_mov_b32 s7, 0xf000 648; CI-NEXT: s_mov_b32 s6, 0 649; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 650; CI-NEXT: v_mov_b32_e32 v1, 0 651; CI-NEXT: s_waitcnt lgkmcnt(0) 652; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 653; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 654; CI-NEXT: s_mov_b32 m0, -1 655; CI-NEXT: s_waitcnt vmcnt(0) 656; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 657; CI-NEXT: ds_write_b8 v0, v1 offset:5 658; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 659; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 660; CI-NEXT: ds_write_b8 v0, v2 offset:13 661; CI-NEXT: ds_write_b8 v0, v1 offset:9 662; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 663; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 664; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 665; CI-NEXT: ds_write_b8 v0, v3 offset:8 666; CI-NEXT: ds_write_b8 v0, v4 offset:7 667; CI-NEXT: ds_write_b8 v0, v5 offset:6 668; CI-NEXT: ds_write_b8 v0, v1 offset:16 669; CI-NEXT: ds_write_b8 v0, v6 offset:15 670; CI-NEXT: ds_write_b8 v0, v2 offset:14 671; CI-NEXT: ds_write_b8 v0, v3 offset:12 672; CI-NEXT: ds_write_b8 v0, v4 offset:11 673; CI-NEXT: ds_write_b8 v0, v5 offset:10 674; CI-NEXT: s_endpgm 675; 676; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 677; GFX9-ALIGNED: ; %bb.0: 678; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 679; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 680; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 681; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 682; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 683; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 684; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) 685; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 686; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 687; GFX9-ALIGNED-NEXT: 
v_lshrrev_b32_e32 v3, 24, v0 688; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 689; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 690; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 691; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 692; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 693; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 694; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 695; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8 696; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 697; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 698; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 699; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 700; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 701; GFX9-ALIGNED-NEXT: s_endpgm 702; 703; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: 704; GFX9-UNALIGNED: ; %bb.0: 705; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 706; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 707; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 708; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 709; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 710; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 711; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2 712; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2 713; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) 714; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 715; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 716; GFX9-UNALIGNED-NEXT: s_endpgm 717 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 718 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i 719 %val = load double, double addrspace(1)* %in.gep, align 8 720 %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 721 %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)* 722 %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5 723 %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)* 724 store 
double %val, double addrspace(3)* %addr0, align 1
  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
  store double %val, double addrspace(3)* %addr1, align 1
  ret void
}

; Two volatile f64 loads (%in[x] and %in[x + 1]) are stored to @lds.f64 at
; element indices x and x + 8. Both the bonaire and the gfx900 runs expect the
; two stores to be emitted as a single ds_write2_b64 with offset1:8.
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_two_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}

@foo = addrspace(3) global [4 x i32] undef, align 4

; Stores the constant 123 to elements 0 and 1 of @foo (two adjacent dwords).
; Both runs expect one 64-bit ds_write_b64 instead of two 32-bit writes.
define amdgpu_kernel void @store_constant_adjacent_offsets() {
; CI-LABEL: store_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
  ret void
}

; Stores the constant 123 to elements 0 and 2 of @foo (one dword gap between
; them). Both runs expect a single ds_write2_b32 with offset1:2.
define amdgpu_kernel void @store_constant_disjoint_offsets() {
; CI-LABEL: store_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; GFX9-NEXT: s_endpgm
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
  ret void
}

@bar = addrspace(3) global [4 x i64] undef, align 4

; Stores the i64 constant 123 to elements 0 and 1 of @bar; the global and both
; stores are only 4-byte aligned. Both runs currently expect the pair to be
; emitted as one ds_write_b128.
; NOTE(review): a single 128-bit write for align-4 stores is what the generated
; checks record; confirm it is still the intended output when regenerating.
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, v0
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b128 v1, v[0:3]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-NEXT: s_endpgm
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
  ret void
}

@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; Stores the i64 constant 123 to elements 2048 and 4095 of @bar.large (byte
; offsets 16384 and 32760). Both runs expect the stores to stay separate: two
; ds_write_b64 instructions carrying those byte offsets.
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; GFX9-NEXT: s_endpgm
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
  ret void
}

@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4

; Stores one loaded float to ten LDS slots: @sgemm.lA at indices x, x+1, x+16,
; x+17 and @sgemm.lB at indices y, y+1, y+32, y+33, y+64, y+65. Here %x.i is
; the workgroup id in x (not a workitem id) and %y.i is the workitem id in y.
; Both runs expect the ten stores to be paired into five ds_write2_b32.
define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
; CI-LABEL: write2_sgemm_sequence:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_lshl_b32 s1, s2, 2
; CI-NEXT: s_add_i32 s2, s1, 0xc20
; CI-NEXT: s_addk_i32 s1, 0xc60
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_add_i32 s1, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
  %val = load float, float addrspace(1)* %in
  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx44, align 4
  %add47 = add nsw i32 %x.i, 1
  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
  store float %val, float addrspace(3)* %arrayidx48, align 4
  %add51 = add nsw i32 %x.i, 16
  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
  store float %val, float addrspace(3)* %arrayidx52, align 4
  %add55 = add nsw i32 %x.i, 17
  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
  store float %val, float addrspace(3)* %arrayidx56, align 4
  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
  store float %val, float addrspace(3)* %arrayidx60, align 4
  %add63 = add nsw i32 %y.i, 1
  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
  store float %val, float addrspace(3)* %arrayidx64, align 4
  %add67 = add nsw i32 %y.i, 32
  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
  store float %val, float addrspace(3)* %arrayidx68, align 4
  %add71 = add nsw i32 %y.i, 33
  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
  store float %val, float addrspace(3)* %arrayidx72, align 4
  %add75 = add nsw i32 %y.i, 64
  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
  store float %val, float addrspace(3)* %arrayidx76, align 4
  %add79 = add nsw i32 %y.i, 65
  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
  store float %val, float addrspace(3)* %arrayidx80, align 4
  ret void
}

; Loads a <4 x float> from %in and stores it to %out[x], both with align 4.
; Expected output differs by run: the bonaire run and the gfx900 run without
; unaligned-access-mode split the store into two ds_write2_b32, while the
; gfx900 run with unaligned-access-mode emits one ds_write2_b64.
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: simple_write2_v4f32_superreg_align4:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; CI-NEXT: s_load_dword s4, s[0:1], 0x9
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
  ret void
}

@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1

; Stores the constant vector <123, 456> at byte offset 65 of @v2i32_align1 with
; align 1. The bonaire run and the gfx900 run without unaligned-access-mode
; expect eight single-byte ds_write_b8 instructions; the gfx900 run with
; unaligned-access-mode expects one ds_write2_b32 based at address 0x41.
define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-LABEL: write2_v2i32_align1_odd_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
  ret void
}

; Work-item / work-group id intrinsics called by the kernels in this file.
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1

; Attribute groups referenced as #0 / #1 / #2 by the definitions and calls.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { convergent nounwind }