; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s

; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; Two align-4 float loads from @lds at element indices tid and tid+8.
; The expected output merges them into one ds_read2_b32 (offset1 = 8).
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same pattern with the second index at tid+255; the expected output still
; merges the loads (offset1 = 255).
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Second index at tid+257; the expected output keeps two separate
; ds_read_b32 instructions instead of a merged read.
define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_too_far:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:1028
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_too_far:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Four loads at tid+0, tid+8, tid+11 and tid+27; the expected output forms
; two ds_read2_b32 pairs.
define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure there is an instruction between the two sets of reads.
; Four loads at tid+0, tid+8, tid+11 and tid+27 with a call to
; @llvm.amdgcn.s.barrier between the first pair and the second pair. The
; expected output keeps s_barrier between the two ds_read2_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2_barrier:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_add_f32_e32 v3, v1, v2
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:11 offset1:27
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_barrier:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  call void @llvm.amdgcn.s.barrier() #2

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; For some reason adding something to the base address for the first
; element results in only folding the inner pair.
; NOTE(review): the expected output below merges both pairs (offset0 = 2 with
; offset1 = 8, and offset0 = 11 with offset1 = 27), so the remark above looks
; stale - confirm before relying on it.
define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1] offset:8
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Be careful of vectors of pointers. We don't know if the 2 pointers
; in the vectors are really the same base, so this is not safe to
; merge.
; Base pointers come from different subregister of same super
; register. We can't safely merge this.
; NOTE(review): both insertelement instructions below write lane 0 of the
; index vector (first %x.i, then the constant 8), so lane 1 stays undef. The
; expected output matches that (offset:32 on the first read, no offset on the
; second) - confirm whether lane 1 was the intended target.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Apply a constant scalar offset after the pointer vector extract. We
; are rejecting merges that have the same, constant 0 offset, so make
; sure we are really rejecting it because of the different
; subregisters.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2 offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8

  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Both lanes of the pointer vector are built from @lds and indexed with
; tid+0 and tid+8; the expected output merges the two loads into one
; ds_read2_b32.
define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: read2_ptr_is_subreg_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; The first of the two loads is volatile; the expected output keeps two
; separate ds_read_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; The second of the two loads is volatile; the expected output keeps two
; separate ds_read_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Can't fold since not correctly aligned.
; With +unaligned-access-mode on gfx900 the expected output does merge these
; align-1 loads into one ds_read2_b32; the other two configurations expand
; them into byte reads.
define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; CI-LABEL: unaligned_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u8 v2, v1 offset:35
; CI-NEXT: ds_read_u8 v3, v1 offset:34
; CI-NEXT: ds_read_u8 v4, v1 offset:33
; CI-NEXT: ds_read_u8 v5, v1 offset:32
; CI-NEXT: ds_read_u8 v6, v1 offset:3
; CI-NEXT: ds_read_u8 v7, v1 offset:2
; CI-NEXT: ds_read_u8 v8, v1 offset:1
; CI-NEXT: ds_read_u8 v1, v1
; CI-NEXT: s_waitcnt lgkmcnt(7)
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: s_waitcnt lgkmcnt(3)
; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; CI-NEXT: s_waitcnt lgkmcnt(2)
; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_or_b32_e32 v1, v8, v1
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v6, v1
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:3
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:32
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 1
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 1
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Two align-1 float loads at byte offsets 5 and 9 from the per-thread base
; pointer.
define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; CI-LABEL: unaligned_offset_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u8 v2, v1 offset:12
; CI-NEXT: ds_read_u8 v3, v1 offset:11
; CI-NEXT: ds_read_u8 v4, v1 offset:10
; CI-NEXT: ds_read_u8 v5, v1 offset:9
; CI-NEXT: ds_read_u8 v6, v1 offset:8
; CI-NEXT: ds_read_u8 v7, v1 offset:7
; CI-NEXT: ds_read_u8 v8, v1 offset:6
; CI-NEXT: ds_read_u8 v1, v1 offset:5
; CI-NEXT: s_waitcnt lgkmcnt(7)
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: s_waitcnt lgkmcnt(3)
; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; CI-NEXT: s_waitcnt lgkmcnt(2)
; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_or_b32_e32 v1, v8, v1
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v6, v1
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:8
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:9
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
  %base.i8 = bitcast float addrspace(3)* %base to i8 addrspace(3)*
  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to float addrspace(3)*
  %val0 = load float, float addrspace(3)* %addr0, align 1
  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to float addrspace(3)*
  %val1 = load float, float addrspace(3)* %addr1, align 1
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same index pattern as unaligned_read2_f32 (tid and tid+8) but with align-2
; loads.
define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; CI-LABEL: misaligned_2_simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xb
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u16 v2, v1 offset:34
; CI-NEXT: ds_read_u16 v3, v1 offset:32
; CI-NEXT: ds_read_u16 v4, v1 offset:2
; CI-NEXT: ds_read_u16 v1, v1
; CI-NEXT: s_waitcnt lgkmcnt(3)
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: s_waitcnt lgkmcnt(2)
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_or_b32_e32 v1, v4, v1
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32
; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 2
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 2
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Two align-8 double loads from @lds.f64 at element indices tid and tid+8.
; The expected output merges them into one ds_read2_b64 (offset1 = 8).
define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f64:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: 
buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64 837; CI-NEXT: s_endpgm 838; 839; GFX9-LABEL: simple_read2_f64_max_offset: 840; GFX9: ; %bb.0: 841; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 842; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 843; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 844; GFX9-NEXT: s_waitcnt lgkmcnt(0) 845; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 846; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 847; GFX9-NEXT: s_endpgm 848 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 849 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 850 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 851 %add.x = add nsw i32 %x.i, 255 852 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 853 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 854 %sum = fadd double %val0, %val1 855 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 856 store double %sum, double addrspace(1)* %out.gep, align 8 857 ret void 858} 859 860define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { 861; CI-LABEL: simple_read2_f64_too_far: 862; CI: ; %bb.0: 863; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 864; CI-NEXT: s_mov_b32 m0, -1 865; CI-NEXT: ds_read_b64 v[1:2], v0 866; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 867; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 868; CI-NEXT: s_mov_b32 s3, 0xf000 869; CI-NEXT: s_mov_b32 s2, 0 870; CI-NEXT: s_waitcnt lgkmcnt(0) 871; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 872; CI-NEXT: v_mov_b32_e32 v1, 0 873; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 874; CI-NEXT: s_endpgm 875; 876; GFX9-LABEL: simple_read2_f64_too_far: 877; GFX9: ; %bb.0: 878; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 879; GFX9-NEXT: ds_read_b64 v[0:1], v4 880; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 881; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 882; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 883; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 884; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 885; GFX9-NEXT: s_endpgm 886 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 887 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 888 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 889 %add.x = add nsw i32 %x.i, 257 890 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 891 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 892 %sum = fadd double %val0, %val1 893 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 894 store double %sum, double addrspace(1)* %out.gep, align 8 895 ret void 896} 897 898; Alignment only 4 899define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { 900; CI-LABEL: misaligned_read2_f64: 901; CI: ; %bb.0: 902; CI-NEXT: s_load_dword s2, s[0:1], 0xb 903; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 904; CI-NEXT: s_mov_b32 m0, -1 905; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 906; CI-NEXT: s_mov_b32 s3, 0xf000 907; CI-NEXT: s_waitcnt lgkmcnt(0) 908; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 909; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 910; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 911; CI-NEXT: s_mov_b32 s2, 0 912; CI-NEXT: s_waitcnt lgkmcnt(0) 913; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 914; CI-NEXT: v_mov_b32_e32 v1, 0 915; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 916; CI-NEXT: s_endpgm 917; 918; GFX9-LABEL: misaligned_read2_f64: 919; GFX9: ; %bb.0: 920; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 921; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 922; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 923; GFX9-NEXT: s_waitcnt lgkmcnt(0) 924; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 925; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 926; GFX9-NEXT: ds_read2_b32 v[2:3], v2 
offset0:14 offset1:15 927; GFX9-NEXT: s_waitcnt lgkmcnt(0) 928; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 929; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 930; GFX9-NEXT: s_endpgm 931 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 932 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 933 %val0 = load double, double addrspace(3)* %arrayidx0, align 4 934 %add.x = add nsw i32 %x.i, 7 935 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x 936 %val1 = load double, double addrspace(3)* %arrayidx1, align 4 937 %sum = fadd double %val0, %val1 938 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 939 store double %sum, double addrspace(1)* %out.gep, align 4 940 ret void 941} 942 943@foo = addrspace(3) global [4 x i32] undef, align 4 944 945define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { 946; CI-LABEL: load_constant_adjacent_offsets: 947; CI: ; %bb.0: 948; CI-NEXT: v_mov_b32_e32 v0, 0 949; CI-NEXT: s_mov_b32 m0, -1 950; CI-NEXT: ds_read_b64 v[0:1], v0 951; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 952; CI-NEXT: s_mov_b32 s3, 0xf000 953; CI-NEXT: s_mov_b32 s2, -1 954; CI-NEXT: s_waitcnt lgkmcnt(0) 955; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 956; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 957; CI-NEXT: s_endpgm 958; 959; GFX9-LABEL: load_constant_adjacent_offsets: 960; GFX9: ; %bb.0: 961; GFX9-NEXT: v_mov_b32_e32 v2, 0 962; GFX9-NEXT: ds_read_b64 v[0:1], v2 963; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 964; GFX9-NEXT: s_waitcnt lgkmcnt(0) 965; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 966; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 967; GFX9-NEXT: s_endpgm 968 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 969 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 970 %sum = add i32 
%val0, %val1 971 store i32 %sum, i32 addrspace(1)* %out, align 4 972 ret void 973} 974 975define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { 976; CI-LABEL: load_constant_disjoint_offsets: 977; CI: ; %bb.0: 978; CI-NEXT: v_mov_b32_e32 v0, 0 979; CI-NEXT: s_mov_b32 m0, -1 980; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 981; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 982; CI-NEXT: s_mov_b32 s3, 0xf000 983; CI-NEXT: s_mov_b32 s2, -1 984; CI-NEXT: s_waitcnt lgkmcnt(0) 985; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 986; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 987; CI-NEXT: s_endpgm 988; 989; GFX9-LABEL: load_constant_disjoint_offsets: 990; GFX9: ; %bb.0: 991; GFX9-NEXT: v_mov_b32_e32 v2, 0 992; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 993; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 995; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 996; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 997; GFX9-NEXT: s_endpgm 998 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 999 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 1000 %sum = add i32 %val0, %val1 1001 store i32 %sum, i32 addrspace(1)* %out, align 4 1002 ret void 1003} 1004 1005@bar = addrspace(3) global [4 x i64] undef, align 4 1006 1007define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { 1008; CI-LABEL: load_misaligned64_constant_offsets: 1009; CI: ; %bb.0: 1010; CI-NEXT: v_mov_b32_e32 v0, 0 1011; CI-NEXT: s_mov_b32 m0, -1 1012; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 1013; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1014; CI-NEXT: s_mov_b32 s3, 0xf000 1015; CI-NEXT: s_mov_b32 s2, -1 1016; CI-NEXT: s_waitcnt lgkmcnt(0) 1017; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1018; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1019; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 
0 1020; CI-NEXT: s_endpgm 1021; 1022; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets: 1023; GFX9-ALIGNED: ; %bb.0: 1024; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 1025; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 1026; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1027; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1029; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1030; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1031; GFX9-ALIGNED-NEXT: s_endpgm 1032; 1033; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets: 1034; GFX9-UNALIGNED: ; %bb.0: 1035; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0 1036; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4 1037; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1038; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1040; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1041; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1042; GFX9-UNALIGNED-NEXT: s_endpgm 1043 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 1044 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 1045 %sum = add i64 %val0, %val1 1046 store i64 %sum, i64 addrspace(1)* %out, align 8 1047 ret void 1048} 1049 1050@bar.large = addrspace(3) global [4096 x i64] undef, align 4 1051 1052define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { 1053; CI-LABEL: load_misaligned64_constant_large_offsets: 1054; CI: ; %bb.0: 1055; CI-NEXT: v_mov_b32_e32 v2, 0 1056; CI-NEXT: s_mov_b32 m0, -1 1057; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 1058; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 1059; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1060; CI-NEXT: s_mov_b32 s3, 0xf000 1061; CI-NEXT: s_mov_b32 s2, -1 
1062; CI-NEXT: s_waitcnt lgkmcnt(0) 1063; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1064; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1065; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1066; CI-NEXT: s_endpgm 1067; 1068; GFX9-LABEL: load_misaligned64_constant_large_offsets: 1069; GFX9: ; %bb.0: 1070; GFX9-NEXT: v_mov_b32_e32 v4, 0 1071; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 1072; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 1073; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1074; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1075; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1076; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1077; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1078; GFX9-NEXT: s_endpgm 1079 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 1080 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 1081 %sum = add i64 %val0, %val1 1082 store i64 %sum, i64 addrspace(1)* %out, align 8 1083 ret void 1084} 1085 1086@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 1087@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 1088 1089define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { 1090; CI-LABEL: sgemm_inner_loop_read2_sequence: 1091; CI: ; %bb.0: 1092; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1093; CI-NEXT: s_lshl_b32 s0, s2, 2 1094; CI-NEXT: s_add_i32 s1, s0, 0xc20 1095; CI-NEXT: s_addk_i32 s0, 0xc60 1096; CI-NEXT: v_mov_b32_e32 v0, s1 1097; CI-NEXT: v_mov_b32_e32 v2, s0 1098; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1099; CI-NEXT: s_mov_b32 m0, -1 1100; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1101; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1102; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 1103; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 
1104; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1105; CI-NEXT: s_waitcnt lgkmcnt(0) 1106; CI-NEXT: v_add_f32_e32 v0, v0, v1 1107; CI-NEXT: v_add_f32_e32 v0, v0, v2 1108; CI-NEXT: v_add_f32_e32 v0, v0, v3 1109; CI-NEXT: v_add_f32_e32 v0, v0, v4 1110; CI-NEXT: v_add_f32_e32 v0, v0, v5 1111; CI-NEXT: v_add_f32_e32 v0, v0, v6 1112; CI-NEXT: v_add_f32_e32 v0, v0, v7 1113; CI-NEXT: v_add_f32_e32 v0, v0, v8 1114; CI-NEXT: s_mov_b32 s7, 0xf000 1115; CI-NEXT: s_mov_b32 s6, -1 1116; CI-NEXT: v_add_f32_e32 v0, v0, v9 1117; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1118; CI-NEXT: s_endpgm 1119; 1120; GFX9-LABEL: sgemm_inner_loop_read2_sequence: 1121; GFX9: ; %bb.0: 1122; GFX9-NEXT: s_lshl_b32 s2, s2, 2 1123; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 1124; GFX9-NEXT: s_addk_i32 s2, 0xc60 1125; GFX9-NEXT: v_mov_b32_e32 v0, s3 1126; GFX9-NEXT: v_mov_b32_e32 v2, s2 1127; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1128; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1129; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1130; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 1131; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1132; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1133; GFX9-NEXT: s_waitcnt lgkmcnt(4) 1134; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 1135; GFX9-NEXT: s_waitcnt lgkmcnt(3) 1136; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 1137; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 1138; GFX9-NEXT: s_waitcnt lgkmcnt(2) 1139; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 1140; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 1141; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1142; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 1144; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 1145; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 1146; GFX9-NEXT: v_mov_b32_e32 v10, 0 1147; GFX9-NEXT: v_add_f32_e32 v0, v0, v9 1148; GFX9-NEXT: global_store_dword v10, v0, s[0:1] 1149; GFX9-NEXT: s_endpgm 1150 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 1151 %y.i = tail call i32 
@llvm.amdgcn.workitem.id.y() #1 1152 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i 1153 %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 1154 %add47 = add nsw i32 %x.i, 1 1155 %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 1156 %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 1157 %add51 = add nsw i32 %x.i, 16 1158 %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 1159 %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 1160 %add55 = add nsw i32 %x.i, 17 1161 %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 1162 %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 1163 %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i 1164 %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 1165 %add63 = add nsw i32 %y.i, 1 1166 %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 1167 %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 1168 %add67 = add nsw i32 %y.i, 32 1169 %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 1170 %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 1171 %add71 = add nsw i32 %y.i, 33 1172 %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 1173 %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 1174 %add75 = add nsw i32 %y.i, 64 1175 %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 1176 %tmp24 = load float, float addrspace(3)* %arrayidx76, align 4 1177 %add79 = add nsw i32 %y.i, 65 1178 %arrayidx80 = getelementptr inbounds [776 x float], [776 x 
float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 1179 %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 1180 %sum.0 = fadd float %tmp16, %tmp17 1181 %sum.1 = fadd float %sum.0, %tmp18 1182 %sum.2 = fadd float %sum.1, %tmp19 1183 %sum.3 = fadd float %sum.2, %tmp20 1184 %sum.4 = fadd float %sum.3, %tmp21 1185 %sum.5 = fadd float %sum.4, %tmp22 1186 %sum.6 = fadd float %sum.5, %tmp23 1187 %sum.7 = fadd float %sum.6, %tmp24 1188 %sum.8 = fadd float %sum.7, %tmp25 1189 store float %sum.8, float addrspace(1)* %C, align 4 1190 ret void 1191} 1192 1193define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { 1194; CI-LABEL: misaligned_read2_v2i32: 1195; CI: ; %bb.0: 1196; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1197; CI-NEXT: s_load_dword s0, s[0:1], 0xb 1198; CI-NEXT: s_mov_b32 m0, -1 1199; CI-NEXT: s_mov_b32 s7, 0xf000 1200; CI-NEXT: s_mov_b32 s6, -1 1201; CI-NEXT: s_waitcnt lgkmcnt(0) 1202; CI-NEXT: v_mov_b32_e32 v0, s0 1203; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1204; CI-NEXT: s_waitcnt lgkmcnt(0) 1205; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1206; CI-NEXT: s_endpgm 1207; 1208; GFX9-LABEL: misaligned_read2_v2i32: 1209; GFX9: ; %bb.0: 1210; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1211; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1212; GFX9-NEXT: v_mov_b32_e32 v2, 0 1213; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX9-NEXT: v_mov_b32_e32 v0, s4 1215; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1217; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1218; GFX9-NEXT: s_endpgm 1219 %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 1220 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 1221 ret void 1222} 1223 1224define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { 1225; CI-LABEL: misaligned_read2_i64: 1226; CI: ; %bb.0: 1227; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1228; 
CI-NEXT: s_load_dword s0, s[0:1], 0xb 1229; CI-NEXT: s_mov_b32 m0, -1 1230; CI-NEXT: s_mov_b32 s7, 0xf000 1231; CI-NEXT: s_mov_b32 s6, -1 1232; CI-NEXT: s_waitcnt lgkmcnt(0) 1233; CI-NEXT: v_mov_b32_e32 v0, s0 1234; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1235; CI-NEXT: s_waitcnt lgkmcnt(0) 1236; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1237; CI-NEXT: s_endpgm 1238; 1239; GFX9-LABEL: misaligned_read2_i64: 1240; GFX9: ; %bb.0: 1241; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1242; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1243; GFX9-NEXT: v_mov_b32_e32 v2, 0 1244; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1245; GFX9-NEXT: v_mov_b32_e32 v0, s4 1246; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1247; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1248; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1249; GFX9-NEXT: s_endpgm 1250 %load = load i64, i64 addrspace(3)* %in, align 4 1251 store i64 %load, i64 addrspace(1)* %out, align 8 1252 ret void 1253} 1254 1255define amdgpu_kernel void @ds_read_diff_base_interleaving( 1256; CI-LABEL: ds_read_diff_base_interleaving: 1257; CI: ; %bb.0: ; %bb 1258; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1259; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1260; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1261; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1262; CI-NEXT: s_mov_b32 m0, -1 1263; CI-NEXT: s_mov_b32 s7, 0xf000 1264; CI-NEXT: s_waitcnt lgkmcnt(0) 1265; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1 1266; CI-NEXT: v_add_i32_e32 v4, vcc, s1, v0 1267; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v1 1268; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0 1269; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1270; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:1 1271; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:4 1272; CI-NEXT: s_mov_b32 s6, -1 1273; CI-NEXT: s_waitcnt lgkmcnt(0) 1274; CI-NEXT: v_mul_f32_e32 v0, v0, v4 1275; CI-NEXT: v_add_f32_e32 v4, 2.0, v0 1276; CI-NEXT: v_mul_f32_e32 v5, v1, v5 1277; CI-NEXT: ds_read2_b32 v[0:1], v6 offset1:4 1278; CI-NEXT: s_waitcnt lgkmcnt(0) 1279; 
CI-NEXT: v_mul_f32_e32 v0, v2, v0 1280; CI-NEXT: v_sub_f32_e32 v0, v4, v0 1281; CI-NEXT: v_sub_f32_e32 v0, v0, v5 1282; CI-NEXT: v_mul_f32_e32 v1, v3, v1 1283; CI-NEXT: v_sub_f32_e32 v0, v0, v1 1284; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40 1285; CI-NEXT: s_endpgm 1286; 1287; GFX9-LABEL: ds_read_diff_base_interleaving: 1288; GFX9: ; %bb.0: ; %bb 1289; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1290; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 1291; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1292; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1293; GFX9-NEXT: v_mov_b32_e32 v8, 0 1294; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1295; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 1296; GFX9-NEXT: v_add_u32_e32 v3, s5, v0 1297; GFX9-NEXT: v_add_u32_e32 v4, s6, v1 1298; GFX9-NEXT: v_add_u32_e32 v6, s7, v0 1299; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1300; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 1301; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1302; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 1303; GFX9-NEXT: s_waitcnt lgkmcnt(2) 1304; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 1305; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0 1306; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1307; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6 1308; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 1309; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 1310; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1311; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 1312; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1313; GFX9-NEXT: global_store_dword v8, v0, s[2:3] offset:40 1314; GFX9-NEXT: s_endpgm 1315 float addrspace(1)* nocapture %arg, 1316 [4 x [4 x float]] addrspace(3)* %arg1, 1317 [4 x [4 x float]] addrspace(3)* %arg2, 1318 [4 x [4 x float]] addrspace(3)* %arg3, 1319 [4 x [4 x float]] addrspace(3)* %arg4) #1 { 1320bb: 1321 %tmp = getelementptr float, float addrspace(1)* %arg, i64 10 1322 %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2 1323 %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 1324 %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* 
%arg1, i32 0, i32 %tmp6, i32 0 1325 %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5 1326 %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0 1327 %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5 1328 %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1 1329 %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5 1330 %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1 1331 %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5 1332 %tmp15 = load float, float addrspace(3)* %tmp7 1333 %tmp16 = load float, float addrspace(3)* %tmp8 1334 %tmp17 = fmul float %tmp15, %tmp16 1335 %tmp18 = fadd float 2.000000e+00, %tmp17 1336 %tmp19 = load float, float addrspace(3)* %tmp9 1337 %tmp20 = load float, float addrspace(3)* %tmp10 1338 %tmp21 = fmul float %tmp19, %tmp20 1339 %tmp22 = fsub float %tmp18, %tmp21 1340 %tmp23 = load float, float addrspace(3)* %tmp11 1341 %tmp24 = load float, float addrspace(3)* %tmp12 1342 %tmp25 = fmul float %tmp23, %tmp24 1343 %tmp26 = fsub float %tmp22, %tmp25 1344 %tmp27 = load float, float addrspace(3)* %tmp13 1345 %tmp28 = load float, float addrspace(3)* %tmp14 1346 %tmp29 = fmul float %tmp27, %tmp28 1347 %tmp30 = fsub float %tmp26, %tmp29 1348 store float %tmp30, float addrspace(1)* %tmp 1349 ret void 1350} 1351 1352define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) { 1353; CI-LABEL: ds_read_call_read: 1354; CI: ; %bb.0: 1355; CI-NEXT: s_getpc_b64 s[40:41] 1356; CI-NEXT: s_mov_b32 s40, s0 1357; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 1358; CI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9 1359; CI-NEXT: s_load_dword s0, s[0:1], 0xb 1360; 
CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1361; CI-NEXT: s_mov_b32 m0, -1 1362; CI-NEXT: s_mov_b32 s32, 0 1363; CI-NEXT: s_waitcnt lgkmcnt(0) 1364; CI-NEXT: s_add_u32 s40, s40, s3 1365; CI-NEXT: s_addc_u32 s41, s41, 0 1366; CI-NEXT: v_add_i32_e32 v40, vcc, s0, v0 1367; CI-NEXT: s_getpc_b64 s[0:1] 1368; CI-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 1369; CI-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 1370; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 1371; CI-NEXT: ds_read_b32 v41, v40 1372; CI-NEXT: s_mov_b64 s[0:1], s[40:41] 1373; CI-NEXT: s_mov_b64 s[2:3], s[42:43] 1374; CI-NEXT: s_mov_b32 s39, 0xf000 1375; CI-NEXT: s_mov_b32 s38, -1 1376; CI-NEXT: s_waitcnt lgkmcnt(0) 1377; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] 1378; CI-NEXT: ds_read_b32 v0, v40 offset:4 1379; CI-NEXT: s_waitcnt lgkmcnt(0) 1380; CI-NEXT: v_add_i32_e32 v0, vcc, v41, v0 1381; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 1382; CI-NEXT: s_endpgm 1383; 1384; GFX9-LABEL: ds_read_call_read: 1385; GFX9: ; %bb.0: 1386; GFX9-NEXT: s_getpc_b64 s[36:37] 1387; GFX9-NEXT: s_mov_b32 s36, s0 1388; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 1389; GFX9-NEXT: s_nop 0 1390; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 1391; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1392; GFX9-NEXT: s_mov_b32 s32, 0 1393; GFX9-NEXT: v_mov_b32_e32 v40, 0 1394; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1395; GFX9-NEXT: s_add_u32 s36, s36, s3 1396; GFX9-NEXT: s_addc_u32 s37, s37, 0 1397; GFX9-NEXT: s_getpc_b64 s[0:1] 1398; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 1399; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 1400; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s2 1401; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 1402; GFX9-NEXT: ds_read_b32 v42, v41 1403; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] 1404; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] 1405; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1406; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] 1407; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 1408; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX9-NEXT: v_add_u32_e32 v0, v42, v0 1410; GFX9-NEXT: global_store_dword v40, v0, s[34:35] 1411; GFX9-NEXT: s_endpgm 1412 %x = call i32 @llvm.amdgcn.workitem.id.x() 1413 %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x 1414 %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1 1415 %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4 1416 call void @void_func_void() 1417 %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4 1418 %r = add i32 %v0, %v1 1419 store i32 %r, i32 addrspace(1)* %out, align 4 1420 ret void 1421} 1422 1423define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) { 1424; CI-LABEL: ds_read_interp_read: 1425; CI: ; %bb.0: 1426; CI-NEXT: s_mov_b32 m0, -1 1427; CI-NEXT: ds_read_b32 v2, v0 1428; CI-NEXT: s_mov_b32 m0, s0 1429; CI-NEXT: v_interp_mov_f32 v1, p10, attr0.x 1430; CI-NEXT: s_mov_b32 m0, -1 1431; CI-NEXT: ds_read_b32 v0, v0 offset:16 1432; CI-NEXT: s_waitcnt lgkmcnt(0) 1433; CI-NEXT: v_add_f32_e32 v1, v0, v1 1434; CI-NEXT: v_mov_b32_e32 v0, v2 1435; CI-NEXT: ; return to shader part epilog 1436; 1437; GFX9-LABEL: ds_read_interp_read: 1438; GFX9: ; %bb.0: 1439; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:4 1440; GFX9-NEXT: s_mov_b32 m0, s0 1441; GFX9-NEXT: s_nop 0 1442; GFX9-NEXT: v_interp_mov_f32_e32 v2, p10, attr0.x 1443; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1444; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 1445; GFX9-NEXT: ; return to shader part epilog 1446 %v0 = load float, float addrspace(3)* %inptr, align 4 1447 %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims) 1448 %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4 1449 %v1 = load float, float addrspace(3)* %ptr1, align 4 1450 %v1b = fadd float %v1, %intrp 1451 %r0 = insertelement <2 x float> undef, float %v0, i32 0 1452 %r1 = insertelement <2 x float> %r0, float %v1b, i32 1 1453 ret <2 x float> %r1 1454} 1455 1456@v2i32_align1 = internal addrspace(3) 
global [100 x <2 x i32>] undef, align 1 1457 1458define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { 1459; CI-LABEL: read2_v2i32_align1_odd_offset: 1460; CI: ; %bb.0: ; %entry 1461; CI-NEXT: v_mov_b32_e32 v0, 0 1462; CI-NEXT: s_mov_b32 m0, -1 1463; CI-NEXT: ds_read_u8 v1, v0 offset:72 1464; CI-NEXT: ds_read_u8 v2, v0 offset:71 1465; CI-NEXT: ds_read_u8 v3, v0 offset:70 1466; CI-NEXT: ds_read_u8 v4, v0 offset:69 1467; CI-NEXT: ds_read_u8 v5, v0 offset:68 1468; CI-NEXT: s_waitcnt lgkmcnt(4) 1469; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1470; CI-NEXT: s_waitcnt lgkmcnt(3) 1471; CI-NEXT: v_or_b32_e32 v1, v1, v2 1472; CI-NEXT: s_waitcnt lgkmcnt(2) 1473; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1474; CI-NEXT: s_waitcnt lgkmcnt(1) 1475; CI-NEXT: v_or_b32_e32 v3, v3, v4 1476; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1477; CI-NEXT: v_or_b32_e32 v1, v1, v3 1478; CI-NEXT: ds_read_u8 v2, v0 offset:67 1479; CI-NEXT: ds_read_u8 v3, v0 offset:66 1480; CI-NEXT: ds_read_u8 v0, v0 offset:65 1481; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1482; CI-NEXT: s_mov_b32 s3, 0xf000 1483; CI-NEXT: s_mov_b32 s2, -1 1484; CI-NEXT: s_waitcnt lgkmcnt(0) 1485; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1486; CI-NEXT: v_or_b32_e32 v0, v3, v0 1487; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 1488; CI-NEXT: v_or_b32_e32 v2, v3, v2 1489; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1490; CI-NEXT: v_or_b32_e32 v0, v2, v0 1491; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1492; CI-NEXT: s_endpgm 1493; 1494; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1495; GFX9-ALIGNED: ; %bb.0: ; %entry 1496; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 1497; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 1498; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 1499; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 1500; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 1501; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 1502; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70 1503; GFX9-ALIGNED-NEXT: ds_read_u8 
v7, v2 offset:71 1504; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72 1505; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1506; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1507; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1508; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6 1509; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 1510; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v8 1511; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 1512; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5 1513; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1514; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1515; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 1516; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 1517; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1518; GFX9-ALIGNED-NEXT: s_endpgm 1519; 1520; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1521; GFX9-UNALIGNED: ; %bb.0: ; %entry 1522; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 1523; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1524; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1525; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 1526; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1528; GFX9-UNALIGNED-NEXT: s_endpgm 1529entry: 1530 %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 1531 store <2 x i32> %load, <2 x i32> addrspace(1)* %out 1532 ret void 1533} 1534 1535declare void @void_func_void() #3 1536 1537declare i32 @llvm.amdgcn.workgroup.id.x() #1 1538declare i32 @llvm.amdgcn.workgroup.id.y() #1 1539declare i32 @llvm.amdgcn.workitem.id.x() #1 1540declare i32 @llvm.amdgcn.workitem.id.y() #1 1541 1542declare float 
@llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone 1543 1544declare void @llvm.amdgcn.s.barrier() #2 1545 1546attributes #0 = { nounwind } 1547attributes #1 = { nounwind readnone speculatable } 1548attributes #2 = { convergent nounwind } 1549attributes #3 = { nounwind noinline } 1550