; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s

; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.

; LDS (addrspace(3)) arrays that the tests below index with the workitem id.
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; Basic case: two 4-byte-aligned float loads from @lds at element indices
; tid and tid + 8. The checks expect a single ds_read2_b32 with offset1 of 8.
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same as simple_read2_f32 but with the second index at tid + 255. The checks
; still expect one ds_read2_b32, now with offset1 of 255.
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Second index is tid + 257 (byte offset 1028). The checks expect the loads to
; stay as two separate ds_read_b32 instructions, i.e. no ds_read2 is formed
; (255 is the largest element offset merged above).
define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_too_far:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:1028
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_too_far:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Four loads at element indices tid + {0, 8, 11, 27}. The checks expect two
; ds_read2_b32 instructions: one for (0, 8) and one for (11, 27).
define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure there is an instruction between the two sets of reads.
; Same four loads as simple_read2_f32_x2, but with a call to
; llvm.amdgcn.s.barrier between the first and second pair. The checks expect
; one ds_read2_b32 before s_barrier and one after it.
define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2_barrier:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_add_f32_e32 v3, v1, v2
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:11 offset1:27
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_barrier:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  call void @llvm.amdgcn.s.barrier() #2

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; The first element's index has a non-zero constant (2) added to the base.
; NOTE: this comment used to say that doing so results in only the inner pair
; being folded. The checks below expect both pairs to be merged into
; ds_read2_b32, with element offsets (2, 8) and (11, 27).
define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1] offset:8
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Be careful with vectors of pointers. We don't know whether the two
; pointers in the vector really share the same base, so this is not safe
; to merge.
; The base pointers come from different subregisters of the same
; super-register. We can't safely merge this.
; The two loaded addresses are lanes 0 and 1 of a GEP over the
; <2 x float addrspace(3)*> kernel argument %lds.ptr. The checks expect two
; separate ds_read_b32 instructions (no ds_read2).
; NOTE(review): %index.1 inserts the constant 8 into lane 0 again, overwriting
; %x.i and leaving lane 1 of the index vector undef; lane 1 was presumably
; intended. Left unchanged because the autogenerated checks depend on it.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Apply a constant scalar offset after the pointer vector extract. We
; are rejecting merges that have the same, constant 0 offset, so make
; sure we are really rejecting it because of the different
; subregisters.
; NOTE(review): as in read2_ptr_is_subreg_arg_f32, %index.1 writes lane 0 a
; second time. With the extra +8 on lane 1, both reads end up with the same
; constant byte offset (32) in the checks below.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2 offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8

  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Pointer-vector variant where both lanes are built from the same global
; (@lds) and the indices are tid + <0, 8>. Here the base is known to be the
; same, and the checks expect a single ds_read2_b32 with offset1 of 8.
define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: read2_ptr_is_subreg_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same addresses as simple_read2_f32, but the first load is volatile. The
; checks expect two separate ds_read_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same addresses as simple_read2_f32, but the second load is volatile. The
; checks expect two separate ds_read_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Can't fold
since not correctly aligned. 526define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { 527; CI-LABEL: unaligned_read2_f32: 528; CI: ; %bb.0: 529; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 530; CI-NEXT: s_load_dword s0, s[0:1], 0xb 531; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 532; CI-NEXT: s_mov_b32 m0, -1 533; CI-NEXT: s_mov_b32 s7, 0xf000 534; CI-NEXT: s_mov_b32 s6, 0 535; CI-NEXT: s_waitcnt lgkmcnt(0) 536; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 537; CI-NEXT: ds_read_u8 v2, v1 offset:35 538; CI-NEXT: ds_read_u8 v3, v1 offset:34 539; CI-NEXT: ds_read_u8 v4, v1 offset:33 540; CI-NEXT: ds_read_u8 v5, v1 offset:32 541; CI-NEXT: ds_read_u8 v6, v1 offset:3 542; CI-NEXT: ds_read_u8 v7, v1 offset:2 543; CI-NEXT: ds_read_u8 v8, v1 offset:1 544; CI-NEXT: ds_read_u8 v1, v1 545; CI-NEXT: s_waitcnt lgkmcnt(7) 546; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 547; CI-NEXT: s_waitcnt lgkmcnt(3) 548; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 549; CI-NEXT: s_waitcnt lgkmcnt(2) 550; CI-NEXT: v_or_b32_e32 v6, v6, v7 551; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 552; CI-NEXT: v_or_b32_e32 v2, v2, v3 553; CI-NEXT: s_waitcnt lgkmcnt(1) 554; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 555; CI-NEXT: s_waitcnt lgkmcnt(0) 556; CI-NEXT: v_or_b32_e32 v1, v8, v1 557; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 558; CI-NEXT: v_or_b32_e32 v4, v4, v5 559; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 560; CI-NEXT: v_or_b32_e32 v1, v6, v1 561; CI-NEXT: v_or_b32_e32 v2, v2, v4 562; CI-NEXT: v_add_f32_e32 v2, v1, v2 563; CI-NEXT: v_mov_b32_e32 v1, 0 564; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 565; CI-NEXT: s_endpgm 566; 567; GFX9-ALIGNED-LABEL: unaligned_read2_f32: 568; GFX9-ALIGNED: ; %bb.0: 569; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 570; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c 571; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 572; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 573; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 574; GFX9-ALIGNED-NEXT: 
ds_read_u8 v2, v1 575; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1 576; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2 577; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:3 578; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:32 579; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33 580; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34 581; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35 582; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6) 583; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 584; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4) 585; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4 586; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 587; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) 588; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 589; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 590; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 591; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 592; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 593; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] 594; GFX9-ALIGNED-NEXT: s_endpgm 595; 596; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: 597; GFX9-UNALIGNED: ; %bb.0: 598; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 599; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c 600; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 601; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 602; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 603; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 604; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 605; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 606; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] 607; GFX9-UNALIGNED-NEXT: s_endpgm 608 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 609 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i 610 %val0 = load float, float addrspace(3)* %arrayidx0, align 1 611 %add.x = add nsw i32 %x.i, 8 612 %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x 613 %val1 = load float, float addrspace(3)* %arrayidx1, 
align 1 614 %sum = fadd float %val0, %val1 615 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i 616 store float %sum, float addrspace(1)* %out.gep, align 4 617 ret void 618} 619 620define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { 621; CI-LABEL: unaligned_offset_read2_f32: 622; CI: ; %bb.0: 623; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 624; CI-NEXT: s_load_dword s0, s[0:1], 0xb 625; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 626; CI-NEXT: s_mov_b32 m0, -1 627; CI-NEXT: s_mov_b32 s7, 0xf000 628; CI-NEXT: s_mov_b32 s6, 0 629; CI-NEXT: s_waitcnt lgkmcnt(0) 630; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 631; CI-NEXT: ds_read_u8 v2, v1 offset:12 632; CI-NEXT: ds_read_u8 v3, v1 offset:11 633; CI-NEXT: ds_read_u8 v4, v1 offset:10 634; CI-NEXT: ds_read_u8 v5, v1 offset:9 635; CI-NEXT: ds_read_u8 v6, v1 offset:8 636; CI-NEXT: ds_read_u8 v7, v1 offset:7 637; CI-NEXT: ds_read_u8 v8, v1 offset:6 638; CI-NEXT: ds_read_u8 v1, v1 offset:5 639; CI-NEXT: s_waitcnt lgkmcnt(7) 640; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 641; CI-NEXT: s_waitcnt lgkmcnt(3) 642; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 643; CI-NEXT: s_waitcnt lgkmcnt(2) 644; CI-NEXT: v_or_b32_e32 v6, v6, v7 645; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 646; CI-NEXT: v_or_b32_e32 v2, v2, v3 647; CI-NEXT: s_waitcnt lgkmcnt(1) 648; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 649; CI-NEXT: s_waitcnt lgkmcnt(0) 650; CI-NEXT: v_or_b32_e32 v1, v8, v1 651; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 652; CI-NEXT: v_or_b32_e32 v4, v4, v5 653; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 654; CI-NEXT: v_or_b32_e32 v1, v6, v1 655; CI-NEXT: v_or_b32_e32 v2, v2, v4 656; CI-NEXT: v_add_f32_e32 v2, v1, v2 657; CI-NEXT: v_mov_b32_e32 v1, 0 658; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 659; CI-NEXT: s_endpgm 660; 661; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: 662; GFX9-ALIGNED: ; %bb.0: 663; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 664; 
GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c 665; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 666; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 667; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 668; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5 669; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6 670; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7 671; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:8 672; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:9 673; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10 674; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11 675; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12 676; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6) 677; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 678; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4) 679; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4 680; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 681; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) 682; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 683; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 684; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 685; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 686; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 687; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] 688; GFX9-ALIGNED-NEXT: s_endpgm 689; 690; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: 691; GFX9-UNALIGNED: ; %bb.0: 692; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 693; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c 694; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 695; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 696; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5 697; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 698; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 699; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 700; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] 701; GFX9-UNALIGNED-NEXT: s_endpgm 702 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 703 %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i 704 %base.i8 
= bitcast float addrspace(3)* %base to i8 addrspace(3)* 705 %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5 706 %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to float addrspace(3)* 707 %val0 = load float, float addrspace(3)* %addr0, align 1 708 %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9 709 %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to float addrspace(3)* 710 %val1 = load float, float addrspace(3)* %addr1, align 1 711 %sum = fadd float %val0, %val1 712 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i 713 store float %sum, float addrspace(1)* %out.gep, align 4 714 ret void 715} 716 717define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { 718; CI-LABEL: misaligned_2_simple_read2_f32: 719; CI: ; %bb.0: 720; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 721; CI-NEXT: s_load_dword s0, s[0:1], 0xb 722; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 723; CI-NEXT: s_mov_b32 m0, -1 724; CI-NEXT: s_mov_b32 s7, 0xf000 725; CI-NEXT: s_mov_b32 s6, 0 726; CI-NEXT: s_waitcnt lgkmcnt(0) 727; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 728; CI-NEXT: ds_read_u16 v2, v1 offset:34 729; CI-NEXT: ds_read_u16 v3, v1 offset:32 730; CI-NEXT: ds_read_u16 v4, v1 offset:2 731; CI-NEXT: ds_read_u16 v1, v1 732; CI-NEXT: s_waitcnt lgkmcnt(3) 733; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 734; CI-NEXT: s_waitcnt lgkmcnt(2) 735; CI-NEXT: v_or_b32_e32 v2, v2, v3 736; CI-NEXT: s_waitcnt lgkmcnt(1) 737; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 738; CI-NEXT: s_waitcnt lgkmcnt(0) 739; CI-NEXT: v_or_b32_e32 v1, v4, v1 740; CI-NEXT: v_add_f32_e32 v2, v1, v2 741; CI-NEXT: v_mov_b32_e32 v1, 0 742; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 743; CI-NEXT: s_endpgm 744; 745; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: 746; GFX9-ALIGNED: ; %bb.0: 747; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 748; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c 749; 
GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 750; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 751; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 752; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 753; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 754; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 755; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 756; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) 757; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 758; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 759; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 760; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 761; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] 762; GFX9-ALIGNED-NEXT: s_endpgm 763; 764; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: 765; GFX9-UNALIGNED: ; %bb.0: 766; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 767; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c 768; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 769; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 770; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 771; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 772; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 773; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 774; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] 775; GFX9-UNALIGNED-NEXT: s_endpgm 776 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 777 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i 778 %val0 = load float, float addrspace(3)* %arrayidx0, align 2 779 %add.x = add nsw i32 %x.i, 8 780 %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x 781 %val1 = load float, float addrspace(3)* %arrayidx1, align 2 782 %sum = fadd float %val0, %val1 783 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i 784 store float %sum, float addrspace(1)* %out.gep, align 4 785 ret void 786} 787 788define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { 789; CI-LABEL: simple_read2_f64: 790; CI: ; 
%bb.0: 791; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 792; CI-NEXT: s_mov_b32 m0, -1 793; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 794; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 795; CI-NEXT: s_mov_b32 s3, 0xf000 796; CI-NEXT: s_mov_b32 s2, 0 797; CI-NEXT: v_mov_b32_e32 v5, 0 798; CI-NEXT: s_waitcnt lgkmcnt(0) 799; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 800; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64 801; CI-NEXT: s_endpgm 802; 803; GFX9-LABEL: simple_read2_f64: 804; GFX9: ; %bb.0: 805; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 806; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 807; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 808; GFX9-NEXT: s_waitcnt lgkmcnt(0) 809; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 810; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 811; GFX9-NEXT: s_endpgm 812 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 813 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 814 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 815 %add.x = add nsw i32 %x.i, 8 816 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 817 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 818 %sum = fadd double %val0, %val1 819 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 820 store double %sum, double addrspace(1)* %out.gep, align 8 821 ret void 822} 823 824define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { 825; CI-LABEL: simple_read2_f64_max_offset: 826; CI: ; %bb.0: 827; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 828; CI-NEXT: s_mov_b32 m0, -1 829; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 830; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 831; CI-NEXT: s_mov_b32 s3, 0xf000 832; CI-NEXT: s_mov_b32 s2, 0 833; CI-NEXT: v_mov_b32_e32 v5, 0 834; CI-NEXT: s_waitcnt lgkmcnt(0) 835; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 836; CI-NEXT: 
buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64 837; CI-NEXT: s_endpgm 838; 839; GFX9-LABEL: simple_read2_f64_max_offset: 840; GFX9: ; %bb.0: 841; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 842; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 843; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 844; GFX9-NEXT: s_waitcnt lgkmcnt(0) 845; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 846; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 847; GFX9-NEXT: s_endpgm 848 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 849 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 850 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 851 %add.x = add nsw i32 %x.i, 255 852 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 853 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 854 %sum = fadd double %val0, %val1 855 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 856 store double %sum, double addrspace(1)* %out.gep, align 8 857 ret void 858} 859 860define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { 861; CI-LABEL: simple_read2_f64_too_far: 862; CI: ; %bb.0: 863; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 864; CI-NEXT: s_mov_b32 m0, -1 865; CI-NEXT: ds_read_b64 v[1:2], v0 866; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 867; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 868; CI-NEXT: s_mov_b32 s3, 0xf000 869; CI-NEXT: s_mov_b32 s2, 0 870; CI-NEXT: s_waitcnt lgkmcnt(0) 871; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 872; CI-NEXT: v_mov_b32_e32 v1, 0 873; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 874; CI-NEXT: s_endpgm 875; 876; GFX9-LABEL: simple_read2_f64_too_far: 877; GFX9: ; %bb.0: 878; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 879; GFX9-NEXT: ds_read_b64 v[0:1], v4 880; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 881; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 882; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 883; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 884; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 885; GFX9-NEXT: s_endpgm 886 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 887 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 888 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 889 %add.x = add nsw i32 %x.i, 257 890 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 891 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 892 %sum = fadd double %val0, %val1 893 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 894 store double %sum, double addrspace(1)* %out.gep, align 8 895 ret void 896} 897 898; The two f64 LDS loads in this kernel are only 4-byte aligned (align 4), so each one is expected to be emitted as a single ds_read2_b32 covering its two dwords. 899define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { 900; CI-LABEL: misaligned_read2_f64: 901; CI: ; %bb.0: 902; CI-NEXT: s_load_dword s2, s[0:1], 0xb 903; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 904; CI-NEXT: s_mov_b32 m0, -1 905; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 906; CI-NEXT: s_mov_b32 s3, 0xf000 907; CI-NEXT: s_waitcnt lgkmcnt(0) 908; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 909; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 910; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 911; CI-NEXT: s_mov_b32 s2, 0 912; CI-NEXT: s_waitcnt lgkmcnt(0) 913; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 914; CI-NEXT: v_mov_b32_e32 v1, 0 915; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 916; CI-NEXT: s_endpgm 917; 918; GFX9-LABEL: misaligned_read2_f64: 919; GFX9: ; %bb.0: 920; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 921; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 922; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 923; GFX9-NEXT: s_waitcnt lgkmcnt(0) 924; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 925; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 926; GFX9-NEXT: ds_read2_b32 v[2:3], v2
offset0:14 offset1:15 927; GFX9-NEXT: s_waitcnt lgkmcnt(0) 928; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 929; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 930; GFX9-NEXT: s_endpgm 931 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 932 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 933 %val0 = load double, double addrspace(3)* %arrayidx0, align 4 934 %add.x = add nsw i32 %x.i, 7 935 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x 936 %val1 = load double, double addrspace(3)* %arrayidx1, align 4 937 %sum = fadd double %val0, %val1 938 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 939 store double %sum, double addrspace(1)* %out.gep, align 4 940 ret void 941} 942 943@foo = addrspace(3) global [4 x i32] undef, align 4 944 945define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { 946; CI-LABEL: load_constant_adjacent_offsets: 947; CI: ; %bb.0: 948; CI-NEXT: v_mov_b32_e32 v0, 0 949; CI-NEXT: s_mov_b32 m0, -1 950; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 951; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 952; CI-NEXT: s_mov_b32 s3, 0xf000 953; CI-NEXT: s_mov_b32 s2, -1 954; CI-NEXT: s_waitcnt lgkmcnt(0) 955; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 956; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 957; CI-NEXT: s_endpgm 958; 959; GFX9-LABEL: load_constant_adjacent_offsets: 960; GFX9: ; %bb.0: 961; GFX9-NEXT: v_mov_b32_e32 v2, 0 962; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 963; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 964; GFX9-NEXT: s_waitcnt lgkmcnt(0) 965; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 966; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 967; GFX9-NEXT: s_endpgm 968 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 969 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
970 %sum = add i32 %val0, %val1 971 store i32 %sum, i32 addrspace(1)* %out, align 4 972 ret void 973} 974
; Loads elements 0 and 2 of @foo (constant, non-adjacent offsets) and stores their sum; both targets are expected to merge the two loads into one ds_read2_b32 with offset1:2.
975define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { 976; CI-LABEL: load_constant_disjoint_offsets: 977; CI: ; %bb.0: 978; CI-NEXT: v_mov_b32_e32 v0, 0 979; CI-NEXT: s_mov_b32 m0, -1 980; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 981; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 982; CI-NEXT: s_mov_b32 s3, 0xf000 983; CI-NEXT: s_mov_b32 s2, -1 984; CI-NEXT: s_waitcnt lgkmcnt(0) 985; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 986; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 987; CI-NEXT: s_endpgm 988; 989; GFX9-LABEL: load_constant_disjoint_offsets: 990; GFX9: ; %bb.0: 991; GFX9-NEXT: v_mov_b32_e32 v2, 0 992; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 993; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 995; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 996; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 997; GFX9-NEXT: s_endpgm 998 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 999 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 1000 %sum = add i32 %val0, %val1 1001 store i32 %sum, i32 addrspace(1)* %out, align 4 1002 ret void 1003} 1004 1005@bar = addrspace(3) global [4 x i64] undef, align 4 1006 1007define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { 1008; CI-LABEL: load_misaligned64_constant_offsets: 1009; CI: ; %bb.0: 1010; CI-NEXT: v_mov_b32_e32 v2, 0 1011; CI-NEXT: s_mov_b32 m0, -1 1012; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1013; CI-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 1014; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1015; CI-NEXT: s_mov_b32 s3, 0xf000 1016; CI-NEXT: s_mov_b32 s2, -1 1017; CI-NEXT: s_waitcnt lgkmcnt(0) 1018; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1019; CI-NEXT: v_addc_u32_e32
v1, vcc, v1, v3, vcc 1020; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1021; CI-NEXT: s_endpgm 1022; 1023; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets: 1024; GFX9-ALIGNED: ; %bb.0: 1025; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 1026; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v4 offset1:1 1027; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3 1028; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1029; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1030; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1031; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1032; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1033; GFX9-ALIGNED-NEXT: s_endpgm 1034; 1035; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets: 1036; GFX9-UNALIGNED: ; %bb.0: 1037; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0 1038; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4 1039; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1040; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1041; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1042; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1043; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1044; GFX9-UNALIGNED-NEXT: s_endpgm 1045 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 1046 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 1047 %sum = add i64 %val0, %val1 1048 store i64 %sum, i64 addrspace(1)* %out, align 8 1049 ret void 1050} 1051 1052@bar.large = addrspace(3) global [4096 x i64] undef, align 4 1053 1054define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { 1055; CI-LABEL: load_misaligned64_constant_large_offsets: 1056; CI: ; %bb.0: 1057; CI-NEXT: v_mov_b32_e32 v0, 0x4000 1058; CI-NEXT: v_mov_b32_e32 v2, 0x7ff8 1059; CI-NEXT: s_mov_b32 m0, -1 1060; CI-NEXT: ds_read2_b32 
v[0:1], v0 offset1:1 1061; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1062; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1063; CI-NEXT: s_mov_b32 s3, 0xf000 1064; CI-NEXT: s_mov_b32 s2, -1 1065; CI-NEXT: s_waitcnt lgkmcnt(0) 1066; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1067; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1068; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1069; CI-NEXT: s_endpgm 1070; 1071; GFX9-LABEL: load_misaligned64_constant_large_offsets: 1072; GFX9: ; %bb.0: 1073; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000 1074; GFX9-NEXT: v_mov_b32_e32 v2, 0x7ff8 1075; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1076; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1077; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1078; GFX9-NEXT: v_mov_b32_e32 v4, 0 1079; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1080; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1081; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1082; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1083; GFX9-NEXT: s_endpgm 1084 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 1085 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 1086 %sum = add i64 %val0, %val1 1087 store i64 %sum, i64 addrspace(1)* %out, align 8 1088 ret void 1089} 1090 1091@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 1092@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 1093 1094define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { 1095; CI-LABEL: sgemm_inner_loop_read2_sequence: 1096; CI: ; %bb.0: 1097; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1098; CI-NEXT: s_lshl_b32 s0, s2, 2 1099; CI-NEXT: s_add_i32 s1, s0, 0xc20 1100; CI-NEXT: s_addk_i32 s0, 0xc60 1101; CI-NEXT: v_mov_b32_e32 v0, s1 1102; CI-NEXT: v_mov_b32_e32 v4, s0 1103; CI-NEXT: 
s_mov_b32 m0, -1 1104; CI-NEXT: ds_read2_b32 v[2:3], v0 offset1:1 1105; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1106; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1107; CI-NEXT: ds_read2_b32 v[0:1], v8 offset1:1 1108; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1109; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1110; CI-NEXT: s_mov_b32 s7, 0xf000 1111; CI-NEXT: s_waitcnt lgkmcnt(0) 1112; CI-NEXT: v_add_f32_e32 v2, v2, v3 1113; CI-NEXT: v_add_f32_e32 v2, v2, v4 1114; CI-NEXT: v_add_f32_e32 v2, v2, v5 1115; CI-NEXT: v_add_f32_e32 v0, v2, v0 1116; CI-NEXT: v_add_f32_e32 v0, v0, v1 1117; CI-NEXT: v_add_f32_e32 v0, v0, v6 1118; CI-NEXT: v_add_f32_e32 v0, v0, v7 1119; CI-NEXT: v_add_f32_e32 v0, v0, v8 1120; CI-NEXT: s_mov_b32 s6, -1 1121; CI-NEXT: v_add_f32_e32 v0, v0, v9 1122; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1123; CI-NEXT: s_endpgm 1124; 1125; GFX9-LABEL: sgemm_inner_loop_read2_sequence: 1126; GFX9: ; %bb.0: 1127; GFX9-NEXT: s_lshl_b32 s2, s2, 2 1128; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 1129; GFX9-NEXT: s_addk_i32 s2, 0xc60 1130; GFX9-NEXT: v_mov_b32_e32 v0, s3 1131; GFX9-NEXT: v_mov_b32_e32 v4, s2 1132; GFX9-NEXT: ds_read2_b32 v[2:3], v0 offset1:1 1133; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1134; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1135; GFX9-NEXT: ds_read2_b32 v[0:1], v8 offset1:1 1136; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1137; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1138; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX9-NEXT: v_add_f32_e32 v2, v2, v3 1141; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 1142; GFX9-NEXT: v_add_f32_e32 v2, v2, v5 1143; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 1144; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 1145; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 1146; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 1147; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 1148; GFX9-NEXT: v_mov_b32_e32 v10, 0 1149; GFX9-NEXT: v_add_f32_e32 v0, v0, v9 1150; GFX9-NEXT: 
global_store_dword v10, v0, s[0:1] 1151; GFX9-NEXT: s_endpgm 1152 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 1153 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 1154 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i 1155 %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 1156 %add47 = add nsw i32 %x.i, 1 1157 %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 1158 %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 1159 %add51 = add nsw i32 %x.i, 16 1160 %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 1161 %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 1162 %add55 = add nsw i32 %x.i, 17 1163 %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 1164 %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 1165 %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i 1166 %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 1167 %add63 = add nsw i32 %y.i, 1 1168 %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 1169 %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 1170 %add67 = add nsw i32 %y.i, 32 1171 %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 1172 %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 1173 %add71 = add nsw i32 %y.i, 33 1174 %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 1175 %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 1176 %add75 = add nsw i32 %y.i, 64 1177 %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 1178 %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 1179 %add79 = add nsw i32 %y.i, 65 1180 %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 1181 %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 1182 %sum.0 = fadd float %tmp16, %tmp17 1183 %sum.1 = fadd float %sum.0, %tmp18 1184 %sum.2 = fadd float %sum.1, %tmp19 1185 %sum.3 = fadd float %sum.2, %tmp20 1186 %sum.4 = fadd float %sum.3, %tmp21 1187 %sum.5 = fadd float %sum.4, %tmp22 1188 %sum.6 = fadd float %sum.5, %tmp23 1189 %sum.7 = fadd float %sum.6, %tmp24 1190 %sum.8 = fadd float %sum.7, %tmp25 1191 store float %sum.8, float addrspace(1)* %C, align 4 1192 ret void 1193} 1194
; A single <2 x i32> LDS load with only 4-byte alignment (align 4); both targets are expected to emit it as one ds_read2_b32 with offset1:1 and store the result as a dwordx2.
1195define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { 1196; CI-LABEL: misaligned_read2_v2i32: 1197; CI: ; %bb.0: 1198; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1199; CI-NEXT: s_load_dword s0, s[0:1], 0xb 1200; CI-NEXT: s_mov_b32 m0, -1 1201; CI-NEXT: s_mov_b32 s7, 0xf000 1202; CI-NEXT: s_mov_b32 s6, -1 1203; CI-NEXT: s_waitcnt lgkmcnt(0) 1204; CI-NEXT: v_mov_b32_e32 v0, s0 1205; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1206; CI-NEXT: s_waitcnt lgkmcnt(0) 1207; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1208; CI-NEXT: s_endpgm 1209; 1210; GFX9-LABEL: misaligned_read2_v2i32: 1211; GFX9: ; %bb.0: 1212; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1213; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1214; GFX9-NEXT: v_mov_b32_e32 v2, 0 1215; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX9-NEXT: v_mov_b32_e32 v0, s4 1217; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1218; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1220; GFX9-NEXT: s_endpgm 1221 %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 1222 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 1223 ret void 1224} 1225 1226define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)*
%out, i64 addrspace(3)* %in) #0 { 1227; CI-LABEL: misaligned_read2_i64: 1228; CI: ; %bb.0: 1229; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1230; CI-NEXT: s_load_dword s0, s[0:1], 0xb 1231; CI-NEXT: s_mov_b32 m0, -1 1232; CI-NEXT: s_mov_b32 s7, 0xf000 1233; CI-NEXT: s_mov_b32 s6, -1 1234; CI-NEXT: s_waitcnt lgkmcnt(0) 1235; CI-NEXT: v_mov_b32_e32 v0, s0 1236; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1237; CI-NEXT: s_waitcnt lgkmcnt(0) 1238; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1239; CI-NEXT: s_endpgm 1240; 1241; GFX9-LABEL: misaligned_read2_i64: 1242; GFX9: ; %bb.0: 1243; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1244; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1245; GFX9-NEXT: v_mov_b32_e32 v2, 0 1246; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1247; GFX9-NEXT: v_mov_b32_e32 v0, s4 1248; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1249; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1250; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1251; GFX9-NEXT: s_endpgm 1252 %load = load i64, i64 addrspace(3)* %in, align 4 1253 store i64 %load, i64 addrspace(1)* %out, align 8 1254 ret void 1255} 1256 1257define amdgpu_kernel void @ds_read_diff_base_interleaving( 1258; CI-LABEL: ds_read_diff_base_interleaving: 1259; CI: ; %bb.0: ; %bb 1260; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1261; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1262; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1263; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1264; CI-NEXT: s_mov_b32 m0, -1 1265; CI-NEXT: s_mov_b32 s7, 0xf000 1266; CI-NEXT: s_waitcnt lgkmcnt(0) 1267; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1 1268; CI-NEXT: v_add_i32_e32 v4, vcc, s1, v0 1269; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v1 1270; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0 1271; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1272; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:1 1273; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:4 1274; CI-NEXT: s_mov_b32 s6, -1 1275; CI-NEXT: s_waitcnt lgkmcnt(0) 1276; CI-NEXT: v_mul_f32_e32 v0, v0, v4 1277; CI-NEXT: v_add_f32_e32 
v4, 2.0, v0 1278; CI-NEXT: v_mul_f32_e32 v5, v1, v5 1279; CI-NEXT: ds_read2_b32 v[0:1], v6 offset1:4 1280; CI-NEXT: s_waitcnt lgkmcnt(0) 1281; CI-NEXT: v_mul_f32_e32 v0, v2, v0 1282; CI-NEXT: v_sub_f32_e32 v0, v4, v0 1283; CI-NEXT: v_sub_f32_e32 v0, v0, v5 1284; CI-NEXT: v_mul_f32_e32 v1, v3, v1 1285; CI-NEXT: v_sub_f32_e32 v0, v0, v1 1286; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40 1287; CI-NEXT: s_endpgm 1288; 1289; GFX9-LABEL: ds_read_diff_base_interleaving: 1290; GFX9: ; %bb.0: ; %bb 1291; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1292; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 1293; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1294; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1295; GFX9-NEXT: v_mov_b32_e32 v8, 0 1296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1297; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 1298; GFX9-NEXT: v_add_u32_e32 v3, s5, v0 1299; GFX9-NEXT: v_add_u32_e32 v4, s6, v1 1300; GFX9-NEXT: v_add_u32_e32 v6, s7, v0 1301; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1302; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 1303; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1304; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 1305; GFX9-NEXT: s_waitcnt lgkmcnt(2) 1306; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 1307; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0 1308; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6 1310; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 1311; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 1312; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1313; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 1314; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1315; GFX9-NEXT: global_store_dword v8, v0, s[2:3] offset:40 1316; GFX9-NEXT: s_endpgm 1317 float addrspace(1)* nocapture %arg, 1318 [4 x [4 x float]] addrspace(3)* %arg1, 1319 [4 x [4 x float]] addrspace(3)* %arg2, 1320 [4 x [4 x float]] addrspace(3)* %arg3, 1321 [4 x [4 x float]] addrspace(3)* %arg4) #1 { 1322bb: 1323 %tmp = getelementptr float, float addrspace(1)* %arg, i64 10 1324 %tmp5 = tail call i32 
@llvm.amdgcn.workitem.id.x() #2 1325 %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 1326 %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0 1327 %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5 1328 %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0 1329 %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5 1330 %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1 1331 %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5 1332 %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1 1333 %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5 1334 %tmp15 = load float, float addrspace(3)* %tmp7 1335 %tmp16 = load float, float addrspace(3)* %tmp8 1336 %tmp17 = fmul float %tmp15, %tmp16 1337 %tmp18 = fadd float 2.000000e+00, %tmp17 1338 %tmp19 = load float, float addrspace(3)* %tmp9 1339 %tmp20 = load float, float addrspace(3)* %tmp10 1340 %tmp21 = fmul float %tmp19, %tmp20 1341 %tmp22 = fsub float %tmp18, %tmp21 1342 %tmp23 = load float, float addrspace(3)* %tmp11 1343 %tmp24 = load float, float addrspace(3)* %tmp12 1344 %tmp25 = fmul float %tmp23, %tmp24 1345 %tmp26 = fsub float %tmp22, %tmp25 1346 %tmp27 = load float, float addrspace(3)* %tmp13 1347 %tmp28 = load float, float addrspace(3)* %tmp14 1348 %tmp29 = fmul float %tmp27, %tmp28 1349 %tmp30 = fsub float %tmp26, %tmp29 1350 store float %tmp30, float addrspace(1)* %tmp 1351 ret void 1352} 1353 1354define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) { 1355; CI-LABEL: ds_read_call_read: 1356; CI: ; %bb.0: 1357; CI-NEXT: s_getpc_b64 s[40:41] 1358; CI-NEXT: 
s_mov_b32 s40, s0 1359; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 1360; CI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9 1361; CI-NEXT: s_load_dword s0, s[0:1], 0xb 1362; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1363; CI-NEXT: s_mov_b32 m0, -1 1364; CI-NEXT: s_mov_b32 s32, 0 1365; CI-NEXT: s_waitcnt lgkmcnt(0) 1366; CI-NEXT: s_add_u32 s40, s40, s3 1367; CI-NEXT: s_addc_u32 s41, s41, 0 1368; CI-NEXT: v_add_i32_e32 v40, vcc, s0, v0 1369; CI-NEXT: s_getpc_b64 s[0:1] 1370; CI-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 1371; CI-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 1372; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 1373; CI-NEXT: ds_read_b32 v41, v40 1374; CI-NEXT: s_mov_b64 s[0:1], s[40:41] 1375; CI-NEXT: s_mov_b64 s[2:3], s[42:43] 1376; CI-NEXT: s_mov_b32 s39, 0xf000 1377; CI-NEXT: s_mov_b32 s38, -1 1378; CI-NEXT: s_waitcnt lgkmcnt(0) 1379; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] 1380; CI-NEXT: ds_read_b32 v0, v40 offset:4 1381; CI-NEXT: s_waitcnt lgkmcnt(0) 1382; CI-NEXT: v_add_i32_e32 v0, vcc, v41, v0 1383; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 1384; CI-NEXT: s_endpgm 1385; 1386; GFX9-LABEL: ds_read_call_read: 1387; GFX9: ; %bb.0: 1388; GFX9-NEXT: s_getpc_b64 s[36:37] 1389; GFX9-NEXT: s_mov_b32 s36, s0 1390; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 1391; GFX9-NEXT: s_nop 0 1392; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 1393; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1394; GFX9-NEXT: s_mov_b32 s32, 0 1395; GFX9-NEXT: v_mov_b32_e32 v40, 0 1396; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX9-NEXT: s_add_u32 s36, s36, s3 1398; GFX9-NEXT: s_addc_u32 s37, s37, 0 1399; GFX9-NEXT: s_getpc_b64 s[0:1] 1400; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 1401; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 1402; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s2 1403; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 1404; GFX9-NEXT: ds_read_b32 v42, v41 1405; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] 1406; 
GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] 1407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1408; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] 1409; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 1410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX9-NEXT: v_add_u32_e32 v0, v42, v0 1412; GFX9-NEXT: global_store_dword v40, v0, s[34:35] 1413; GFX9-NEXT: s_endpgm 1414 %x = call i32 @llvm.amdgcn.workitem.id.x() 1415 %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x 1416 %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1 1417 %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4 1418 call void @void_func_void() 1419 %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4 1420 %r = add i32 %v0, %v1 1421 store i32 %r, i32 addrspace(1)* %out, align 4 1422 ret void 1423} 1424 1425define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) { 1426; CI-LABEL: ds_read_interp_read: 1427; CI: ; %bb.0: 1428; CI-NEXT: s_mov_b32 m0, -1 1429; CI-NEXT: ds_read_b32 v2, v0 1430; CI-NEXT: s_mov_b32 m0, s0 1431; CI-NEXT: v_interp_mov_f32 v1, p10, attr0.x 1432; CI-NEXT: s_mov_b32 m0, -1 1433; CI-NEXT: ds_read_b32 v0, v0 offset:16 1434; CI-NEXT: s_waitcnt lgkmcnt(0) 1435; CI-NEXT: v_add_f32_e32 v1, v0, v1 1436; CI-NEXT: v_mov_b32_e32 v0, v2 1437; CI-NEXT: ; return to shader part epilog 1438; 1439; GFX9-LABEL: ds_read_interp_read: 1440; GFX9: ; %bb.0: 1441; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:4 1442; GFX9-NEXT: s_mov_b32 m0, s0 1443; GFX9-NEXT: s_nop 0 1444; GFX9-NEXT: v_interp_mov_f32_e32 v2, p10, attr0.x 1445; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1446; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 1447; GFX9-NEXT: ; return to shader part epilog 1448 %v0 = load float, float addrspace(3)* %inptr, align 4 1449 %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims) 1450 %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4 1451 %v1 = load float, float addrspace(3)* %ptr1, align 4 1452 %v1b = fadd float %v1, %intrp 1453 %r0 = insertelement 
<2 x float> undef, float %v0, i32 0 1454 %r1 = insertelement <2 x float> %r0, float %v1b, i32 1 1455 ret <2 x float> %r1 1456} 1457 1458@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 1459 1460define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { 1461; CI-LABEL: read2_v2i32_align1_odd_offset: 1462; CI: ; %bb.0: ; %entry 1463; CI-NEXT: v_mov_b32_e32 v0, 0 1464; CI-NEXT: s_mov_b32 m0, -1 1465; CI-NEXT: ds_read_u8 v1, v0 offset:72 1466; CI-NEXT: ds_read_u8 v2, v0 offset:71 1467; CI-NEXT: ds_read_u8 v3, v0 offset:70 1468; CI-NEXT: ds_read_u8 v4, v0 offset:69 1469; CI-NEXT: ds_read_u8 v5, v0 offset:68 1470; CI-NEXT: s_waitcnt lgkmcnt(4) 1471; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1472; CI-NEXT: s_waitcnt lgkmcnt(3) 1473; CI-NEXT: v_or_b32_e32 v1, v1, v2 1474; CI-NEXT: s_waitcnt lgkmcnt(2) 1475; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1476; CI-NEXT: s_waitcnt lgkmcnt(1) 1477; CI-NEXT: v_or_b32_e32 v3, v3, v4 1478; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1479; CI-NEXT: v_or_b32_e32 v1, v1, v3 1480; CI-NEXT: ds_read_u8 v2, v0 offset:67 1481; CI-NEXT: ds_read_u8 v3, v0 offset:66 1482; CI-NEXT: ds_read_u8 v0, v0 offset:65 1483; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1484; CI-NEXT: s_mov_b32 s3, 0xf000 1485; CI-NEXT: s_mov_b32 s2, -1 1486; CI-NEXT: s_waitcnt lgkmcnt(0) 1487; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1488; CI-NEXT: v_or_b32_e32 v0, v3, v0 1489; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 1490; CI-NEXT: v_or_b32_e32 v2, v3, v2 1491; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1492; CI-NEXT: v_or_b32_e32 v0, v2, v0 1493; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1494; CI-NEXT: s_endpgm 1495; 1496; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1497; GFX9-ALIGNED: ; %bb.0: ; %entry 1498; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 1499; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 1500; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 1501; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 1502; 
GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 1503; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 1504; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70 1505; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71 1506; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72 1507; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1508; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1510; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6 1511; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 1512; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v8 1513; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 1514; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5 1515; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1516; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1517; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 1518; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 1519; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1520; GFX9-ALIGNED-NEXT: s_endpgm 1521; 1522; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1523; GFX9-UNALIGNED: ; %bb.0: ; %entry 1524; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 1525; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1526; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1527; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 1528; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1529; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1530; GFX9-UNALIGNED-NEXT: s_endpgm 1531entry: 1532 %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 1533 store <2 x i32> %load, <2 x i32> addrspace(1)* %out 1534 ret void 1535} 1536 1537declare void @void_func_void() #3 1538 1539declare i32 
@llvm.amdgcn.workgroup.id.x() #1 1540declare i32 @llvm.amdgcn.workgroup.id.y() #1 1541declare i32 @llvm.amdgcn.workitem.id.x() #1 1542declare i32 @llvm.amdgcn.workitem.id.y() #1 1543 1544declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone 1545 1546declare void @llvm.amdgcn.s.barrier() #2 1547
; Attribute groups referenced as #0-#3 by the definitions, declarations and call sites above.
1548attributes #0 = { nounwind } 1549attributes #1 = { nounwind readnone speculatable } 1550attributes #2 = { convergent nounwind } 1551attributes #3 = { nounwind noinline } 1552