; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s

; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; Two 4-byte-aligned f32 loads from @lds whose indices differ by 8
; elements. The assertions expect a single ds_read2_b32 with offset1:8.
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same shape as simple_read2_f32 but the second index is 255 elements
; away. The assertions still expect one ds_read2_b32 (offset1:255).
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; The second index is 257 elements away. The assertions expect the loads
; to stay separate: two ds_read_b32, the second at byte offset 1028.
define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_too_far:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:1028
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_too_far:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Four loads at element offsets 0, 8, 11 and 27 from the same base. The
; assertions expect two ds_read2_b32: (0, 8) and (11, 27).
define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure there is an instruction between the two sets of reads.
; The IR places a call to llvm.amdgcn.s.barrier between the first and
; second pair of loads; the assertions expect s_barrier to remain between
; the two ds_read2_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2_barrier:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_barrier:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  call void @llvm.amdgcn.s.barrier() #2

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; For some reason adding something to the base address for the first
; element results in only folding the inner pair.
; NOTE(review): the assertions below show both pairs merged, as
; (offset0:2, offset1:8) and (offset0:11, offset1:27), so the sentence
; above looks stale - confirm before relying on it.
define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1] offset:8
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  %val2 = load float, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Be careful of vectors of pointers. We don't know if the 2 pointers
; in the vectors are really the same base, so this is not safe to
; merge.
; Base pointers come from different subregister of same super
; register. We can't safely merge this.
; NOTE(review): both insertelement instructions below write lane 0, so
; lane 0 of the index vector ends up as 8 and lane 1 stays undef. Lane 1
; may have been intended for the second insert - confirm. Changing it
; would require regenerating the assertions.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Apply a constant scalar offset after the pointer vector extract. We
; are rejecting merges that have the same, constant 0 offset, so make
; sure we are really rejecting it because of the different
; subregisters.
; NOTE(review): as in read2_ptr_is_subreg_arg_f32, both insertelement
; instructions write lane 0 of the index vector - confirm this is intended.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2 offset:32
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8

  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Vector-of-pointers GEP where both lanes hold @lds and the indices are
; (tid, tid + 8). The assertions expect the two loads to be merged into
; one ds_read2_b32 with offset1:8.
define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: read2_ptr_is_subreg_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
  %val0 = load float, float addrspace(3)* %gep.0, align 4
  %val1 = load float, float addrspace(3)* %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same addresses as simple_read2_f32, but the first load is volatile.
; The assertions expect two separate ds_read_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same addresses as simple_read2_f32, but the second load is volatile.
; The assertions expect two separate ds_read_b32 instructions.
define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Can't fold since not correctly aligned.
; Both loads are marked align 1. The bonaire run and the gfx900 run with
; -unaligned-access-mode expect byte-wise ds_read_u8 sequences; the gfx900
; run with +unaligned-access-mode expects a single ds_read2_b32.
define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; CI-LABEL: unaligned_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[0:1], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
; CI-NEXT: ds_read_u8 v2, v1 offset:34
; CI-NEXT: ds_read_u8 v3, v1 offset:32
; CI-NEXT: ds_read_u8 v4, v1 offset:3
; CI-NEXT: ds_read_u8 v5, v1 offset:2
; CI-NEXT: ds_read_u8 v6, v1 offset:1
; CI-NEXT: ds_read_u8 v7, v1
; CI-NEXT: ds_read_u8 v8, v1 offset:33
; CI-NEXT: ds_read_u8 v1, v1 offset:35
; CI-NEXT: s_waitcnt lgkmcnt(5)
; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; CI-NEXT: s_waitcnt lgkmcnt(3)
; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v5, v3
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v4, v4, v6
; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:3
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:32
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 1
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 1
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Two align-1 f32 loads at byte offsets 5 and 9 from the per-thread base,
; i.e. adjacent but not dword-aligned. The bonaire run and the gfx900 run
; with -unaligned-access-mode expect ds_read_u8 sequences; the gfx900 run
; with +unaligned-access-mode expects a single ds_read_b64 at offset:5.
define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; CI-LABEL: unaligned_offset_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[0:1], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
; CI-NEXT: ds_read_u8 v2, v1 offset:11
; CI-NEXT: ds_read_u8 v3, v1 offset:9
; CI-NEXT: ds_read_u8 v4, v1 offset:8
; CI-NEXT: ds_read_u8 v5, v1 offset:7
; CI-NEXT: ds_read_u8 v6, v1 offset:6
; CI-NEXT: ds_read_u8 v7, v1 offset:5
; CI-NEXT: ds_read_u8 v8, v1 offset:10
; CI-NEXT: ds_read_u8 v1, v1 offset:12
; CI-NEXT: s_waitcnt lgkmcnt(5)
; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; CI-NEXT: s_waitcnt lgkmcnt(3)
; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v5, v3
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v4, v4, v6
; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:8
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:9
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
  %base.i8 = bitcast float addrspace(3)* %base to i8 addrspace(3)*
  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to float addrspace(3)*
  %val0 = load float, float addrspace(3)* %addr0, align 1
  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to float addrspace(3)*
  %val1 = load float, float addrspace(3)* %addr1, align 1
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; Both loads are marked align 2. The bonaire run and the gfx900 run with
; -unaligned-access-mode expect 16-bit ds_read_u16 pieces; the gfx900 run
; with +unaligned-access-mode expects a single ds_read2_b32.
define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; CI-LABEL: misaligned_2_simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[0:1], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
; CI-NEXT: ds_read_u16 v2, v1 offset:32
; CI-NEXT: ds_read_u16 v3, v1 offset:2
; CI-NEXT: ds_read_u16 v4, v1
; CI-NEXT: ds_read_u16 v1, v1 offset:34
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(2)
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0
; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32
; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 2
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 2
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; f64 counterpart of simple_read2_f32: two 8-byte-aligned double loads
; from @lds.f64 whose indices differ by 8 elements. The assertions expect
; a single ds_read2_b64 with offset1:8.
define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f64:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f64_max_offset:
837; GFX9: ; %bb.0: 838; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 839; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 840; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 841; GFX9-NEXT: s_waitcnt lgkmcnt(0) 842; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 843; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 844; GFX9-NEXT: s_endpgm 845 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 846 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 847 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 848 %add.x = add nsw i32 %x.i, 255 849 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 850 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 851 %sum = fadd double %val0, %val1 852 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 853 store double %sum, double addrspace(1)* %out.gep, align 8 854 ret void 855} 856 857define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { 858; CI-LABEL: simple_read2_f64_too_far: 859; CI: ; %bb.0: 860; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 861; CI-NEXT: s_mov_b32 m0, -1 862; CI-NEXT: ds_read_b64 v[1:2], v0 863; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 864; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 865; CI-NEXT: s_mov_b32 s3, 0xf000 866; CI-NEXT: s_mov_b32 s2, 0 867; CI-NEXT: s_waitcnt lgkmcnt(0) 868; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 869; CI-NEXT: v_mov_b32_e32 v1, 0 870; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 871; CI-NEXT: s_endpgm 872; 873; GFX9-LABEL: simple_read2_f64_too_far: 874; GFX9: ; %bb.0: 875; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 876; GFX9-NEXT: ds_read_b64 v[0:1], v4 877; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 878; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 879; GFX9-NEXT: s_waitcnt lgkmcnt(0) 880; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 881; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], 
s[0:1] 882; GFX9-NEXT: s_endpgm 883 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 884 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 885 %val0 = load double, double addrspace(3)* %arrayidx0, align 8 886 %add.x = add nsw i32 %x.i, 257 887 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x 888 %val1 = load double, double addrspace(3)* %arrayidx1, align 8 889 %sum = fadd double %val0, %val1 890 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 891 store double %sum, double addrspace(1)* %out.gep, align 8 892 ret void 893} 894 895; The f64 loads are only 4-byte aligned, so each one is read as a ds_read2_b32 pair. 896define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { 897; CI-LABEL: misaligned_read2_f64: 898; CI: ; %bb.0: 899; CI-NEXT: s_load_dword s2, s[0:1], 0x2 900; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 901; CI-NEXT: s_mov_b32 m0, -1 902; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 903; CI-NEXT: s_mov_b32 s3, 0xf000 904; CI-NEXT: s_waitcnt lgkmcnt(0) 905; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 906; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 907; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 908; CI-NEXT: s_mov_b32 s2, 0 909; CI-NEXT: s_waitcnt lgkmcnt(0) 910; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 911; CI-NEXT: v_mov_b32_e32 v1, 0 912; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 913; CI-NEXT: s_endpgm 914; 915; GFX9-LABEL: misaligned_read2_f64: 916; GFX9: ; %bb.0: 917; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 918; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 919; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 922; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 923; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 924; GFX9-NEXT: s_waitcnt lgkmcnt(0) 925; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 926; GFX9-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] 927; GFX9-NEXT: s_endpgm 928 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 929 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i 930 %val0 = load double, double addrspace(3)* %arrayidx0, align 4 931 %add.x = add nsw i32 %x.i, 7 932 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x 933 %val1 = load double, double addrspace(3)* %arrayidx1, align 4 934 %sum = fadd double %val0, %val1 935 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i 936 store double %sum, double addrspace(1)* %out.gep, align 4 937 ret void 938} 939 940@foo = addrspace(3) global [4 x i32] undef, align 4 941; Two 4-byte-aligned i32 loads from adjacent constant LDS addresses are expected to merge into a single ds_read_b64. 942define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { 943; CI-LABEL: load_constant_adjacent_offsets: 944; CI: ; %bb.0: 945; CI-NEXT: v_mov_b32_e32 v0, 0 946; CI-NEXT: s_mov_b32 m0, -1 947; CI-NEXT: ds_read_b64 v[0:1], v0 948; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 949; CI-NEXT: s_mov_b32 s3, 0xf000 950; CI-NEXT: s_mov_b32 s2, -1 951; CI-NEXT: s_waitcnt lgkmcnt(0) 952; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 953; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 954; CI-NEXT: s_endpgm 955; 956; GFX9-LABEL: load_constant_adjacent_offsets: 957; GFX9: ; %bb.0: 958; GFX9-NEXT: v_mov_b32_e32 v2, 0 959; GFX9-NEXT: ds_read_b64 v[0:1], v2 960; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 961; GFX9-NEXT: s_waitcnt lgkmcnt(0) 962; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 963; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 964; GFX9-NEXT: s_endpgm 965 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 966 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 967 %sum = add i32 %val0, %val1 968 store i32 %sum, i32 addrspace(1)* %out, align 4 969 ret void 970} 971 972define amdgpu_kernel void 
@load_constant_disjoint_offsets(i32 addrspace(1)* %out) { 973; CI-LABEL: load_constant_disjoint_offsets: 974; CI: ; %bb.0: 975; CI-NEXT: v_mov_b32_e32 v0, 0 976; CI-NEXT: s_mov_b32 m0, -1 977; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 978; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 979; CI-NEXT: s_mov_b32 s3, 0xf000 980; CI-NEXT: s_mov_b32 s2, -1 981; CI-NEXT: s_waitcnt lgkmcnt(0) 982; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 983; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 984; CI-NEXT: s_endpgm 985; 986; GFX9-LABEL: load_constant_disjoint_offsets: 987; GFX9: ; %bb.0: 988; GFX9-NEXT: v_mov_b32_e32 v2, 0 989; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 990; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 991; GFX9-NEXT: s_waitcnt lgkmcnt(0) 992; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 993; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 994; GFX9-NEXT: s_endpgm 995 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 996 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 997 %sum = add i32 %val0, %val1 998 store i32 %sum, i32 addrspace(1)* %out, align 4 999 ret void 1000} 1001 1002@bar = addrspace(3) global [4 x i64] undef, align 4 1003 1004define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { 1005; CI-LABEL: load_misaligned64_constant_offsets: 1006; CI: ; %bb.0: 1007; CI-NEXT: v_mov_b32_e32 v0, 0 1008; CI-NEXT: s_mov_b32 m0, -1 1009; CI-NEXT: ds_read_b128 v[0:3], v0 1010; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1011; CI-NEXT: s_mov_b32 s3, 0xf000 1012; CI-NEXT: s_mov_b32 s2, -1 1013; CI-NEXT: s_waitcnt lgkmcnt(0) 1014; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1015; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1016; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1017; CI-NEXT: s_endpgm 1018; 1019; GFX9-LABEL: load_misaligned64_constant_offsets: 1020; GFX9: ; %bb.0: 1021; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 1022; GFX9-NEXT: ds_read_b128 v[0:3], v4 1023; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1024; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1026; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1027; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1028; GFX9-NEXT: s_endpgm 1029 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 1030 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 1031 %sum = add i64 %val0, %val1 1032 store i64 %sum, i64 addrspace(1)* %out, align 8 1033 ret void 1034} 1035 1036@bar.large = addrspace(3) global [4096 x i64] undef, align 4 1037 1038define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { 1039; CI-LABEL: load_misaligned64_constant_large_offsets: 1040; CI: ; %bb.0: 1041; CI-NEXT: v_mov_b32_e32 v2, 0 1042; CI-NEXT: s_mov_b32 m0, -1 1043; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 1044; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 1045; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1046; CI-NEXT: s_mov_b32 s3, 0xf000 1047; CI-NEXT: s_mov_b32 s2, -1 1048; CI-NEXT: s_waitcnt lgkmcnt(0) 1049; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1050; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1051; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1052; CI-NEXT: s_endpgm 1053; 1054; GFX9-LABEL: load_misaligned64_constant_large_offsets: 1055; GFX9: ; %bb.0: 1056; GFX9-NEXT: v_mov_b32_e32 v4, 0 1057; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 1058; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 1059; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1060; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1061; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1062; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1063; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1064; GFX9-NEXT: s_endpgm 1065 %val0 = load i64, i64 
addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 1066 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 1067 %sum = add i64 %val0, %val1 1068 store i64 %sum, i64 addrspace(1)* %out, align 8 1069 ret void 1070} 1071 1072@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 1073@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 1074 1075define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { 1076; CI-LABEL: sgemm_inner_loop_read2_sequence: 1077; CI: ; %bb.0: 1078; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 1079; CI-NEXT: s_lshl_b32 s0, s2, 2 1080; CI-NEXT: s_add_i32 s1, s0, 0xc20 1081; CI-NEXT: s_addk_i32 s0, 0xc60 1082; CI-NEXT: v_mov_b32_e32 v0, s1 1083; CI-NEXT: v_mov_b32_e32 v2, s0 1084; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1085; CI-NEXT: s_mov_b32 m0, -1 1086; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1087; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1088; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 1089; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1090; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1091; CI-NEXT: s_waitcnt lgkmcnt(0) 1092; CI-NEXT: v_add_f32_e32 v0, v0, v1 1093; CI-NEXT: v_add_f32_e32 v0, v0, v2 1094; CI-NEXT: v_add_f32_e32 v0, v0, v3 1095; CI-NEXT: v_add_f32_e32 v0, v0, v4 1096; CI-NEXT: v_add_f32_e32 v0, v0, v5 1097; CI-NEXT: v_add_f32_e32 v0, v0, v6 1098; CI-NEXT: v_add_f32_e32 v0, v0, v7 1099; CI-NEXT: v_add_f32_e32 v0, v0, v8 1100; CI-NEXT: s_mov_b32 s7, 0xf000 1101; CI-NEXT: s_mov_b32 s6, -1 1102; CI-NEXT: v_add_f32_e32 v0, v0, v9 1103; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1104; CI-NEXT: s_endpgm 1105; 1106; GFX9-LABEL: sgemm_inner_loop_read2_sequence: 1107; GFX9: ; %bb.0: 1108; GFX9-NEXT: s_lshl_b32 s2, s2, 2 1109; GFX9-NEXT: s_add_i32 s3, s2, 
0xc20 1110; GFX9-NEXT: s_addk_i32 s2, 0xc60 1111; GFX9-NEXT: v_mov_b32_e32 v0, s3 1112; GFX9-NEXT: v_mov_b32_e32 v2, s2 1113; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1114; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1115; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1116; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 1117; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1118; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1119; GFX9-NEXT: s_waitcnt lgkmcnt(4) 1120; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 1121; GFX9-NEXT: s_waitcnt lgkmcnt(3) 1122; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 1123; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 1124; GFX9-NEXT: s_waitcnt lgkmcnt(2) 1125; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 1126; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1127; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 1128; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1129; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 1130; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 1131; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 1132; GFX9-NEXT: v_mov_b32_e32 v10, 0 1133; GFX9-NEXT: v_add_f32_e32 v0, v0, v9 1134; GFX9-NEXT: global_store_dword v10, v0, s[0:1] 1135; GFX9-NEXT: s_endpgm 1136 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 1137 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 1138 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i 1139 %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 1140 %add47 = add nsw i32 %x.i, 1 1141 %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 1142 %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 1143 %add51 = add nsw i32 %x.i, 16 1144 %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 1145 %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 1146 %add55 = add nsw i32 %x.i, 17 1147 %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, 
i32 0, i32 %add55 1148 %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 1149 %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i 1150 %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 1151 %add63 = add nsw i32 %y.i, 1 1152 %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 1153 %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 1154 %add67 = add nsw i32 %y.i, 32 1155 %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 1156 %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 1157 %add71 = add nsw i32 %y.i, 33 1158 %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 1159 %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 1160 %add75 = add nsw i32 %y.i, 64 1161 %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 1162 %tmp24 = load float, float addrspace(3)* %arrayidx76, align 4 1163 %add79 = add nsw i32 %y.i, 65 1164 %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 1165 %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 1166 %sum.0 = fadd float %tmp16, %tmp17 1167 %sum.1 = fadd float %sum.0, %tmp18 1168 %sum.2 = fadd float %sum.1, %tmp19 1169 %sum.3 = fadd float %sum.2, %tmp20 1170 %sum.4 = fadd float %sum.3, %tmp21 1171 %sum.5 = fadd float %sum.4, %tmp22 1172 %sum.6 = fadd float %sum.5, %tmp23 1173 %sum.7 = fadd float %sum.6, %tmp24 1174 %sum.8 = fadd float %sum.7, %tmp25 1175 store float %sum.8, float addrspace(1)* %C, align 4 1176 ret void 1177} 1178 1179define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { 1180; CI-LABEL: misaligned_read2_v2i32: 1181; CI: ; %bb.0: 1182; CI-NEXT: s_load_dword s2, 
s[0:1], 0x2 1183; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1184; CI-NEXT: s_mov_b32 m0, -1 1185; CI-NEXT: s_mov_b32 s3, 0xf000 1186; CI-NEXT: s_waitcnt lgkmcnt(0) 1187; CI-NEXT: v_mov_b32_e32 v0, s2 1188; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1189; CI-NEXT: s_mov_b32 s2, -1 1190; CI-NEXT: s_waitcnt lgkmcnt(0) 1191; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1192; CI-NEXT: s_endpgm 1193; 1194; GFX9-LABEL: misaligned_read2_v2i32: 1195; GFX9: ; %bb.0: 1196; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 1197; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1198; GFX9-NEXT: v_mov_b32_e32 v2, 0 1199; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX9-NEXT: v_mov_b32_e32 v0, s4 1201; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1204; GFX9-NEXT: s_endpgm 1205 %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 1206 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 1207 ret void 1208} 1209 1210define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { 1211; CI-LABEL: misaligned_read2_i64: 1212; CI: ; %bb.0: 1213; CI-NEXT: s_load_dword s2, s[0:1], 0x2 1214; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1215; CI-NEXT: s_mov_b32 m0, -1 1216; CI-NEXT: s_mov_b32 s3, 0xf000 1217; CI-NEXT: s_waitcnt lgkmcnt(0) 1218; CI-NEXT: v_mov_b32_e32 v0, s2 1219; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1220; CI-NEXT: s_mov_b32 s2, -1 1221; CI-NEXT: s_waitcnt lgkmcnt(0) 1222; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1223; CI-NEXT: s_endpgm 1224; 1225; GFX9-LABEL: misaligned_read2_i64: 1226; GFX9: ; %bb.0: 1227; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 1228; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 1229; GFX9-NEXT: v_mov_b32_e32 v2, 0 1230; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX9-NEXT: v_mov_b32_e32 v0, s4 1232; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1233; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1234; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[2:3] 1235; GFX9-NEXT: s_endpgm 1236 %load = load i64, i64 addrspace(3)* %in, align 4 1237 store i64 %load, i64 addrspace(1)* %out, align 8 1238 ret void 1239} 1240 1241define amdgpu_kernel void @ds_read_diff_base_interleaving( 1242; CI-LABEL: ds_read_diff_base_interleaving: 1243; CI: ; %bb.0: ; %bb 1244; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 1245; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1246; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1247; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1248; CI-NEXT: s_mov_b32 m0, -1 1249; CI-NEXT: s_waitcnt lgkmcnt(0) 1250; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v1 1251; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v0 1252; CI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 1253; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v0 1254; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1255; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 1256; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1257; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 1258; CI-NEXT: s_mov_b32 s3, 0xf000 1259; CI-NEXT: s_mov_b32 s2, -1 1260; CI-NEXT: s_waitcnt lgkmcnt(2) 1261; CI-NEXT: v_mul_f32_e32 v0, v0, v2 1262; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 1263; CI-NEXT: s_waitcnt lgkmcnt(0) 1264; CI-NEXT: v_mul_f32_e32 v2, v4, v6 1265; CI-NEXT: v_sub_f32_e32 v0, v0, v2 1266; CI-NEXT: v_mul_f32_e32 v1, v1, v3 1267; CI-NEXT: v_sub_f32_e32 v0, v0, v1 1268; CI-NEXT: v_mul_f32_e32 v1, v5, v7 1269; CI-NEXT: v_sub_f32_e32 v0, v0, v1 1270; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40 1271; CI-NEXT: s_endpgm 1272; 1273; GFX9-LABEL: ds_read_diff_base_interleaving: 1274; GFX9: ; %bb.0: ; %bb 1275; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 1276; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1277; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1278; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1279; GFX9-NEXT: v_mov_b32_e32 v8, 0 1280; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1281; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 1282; GFX9-NEXT: v_add_u32_e32 v3, s5, v0 1283; GFX9-NEXT: v_add_u32_e32 v4, s6, v1 1284; 
GFX9-NEXT: v_add_u32_e32 v6, s7, v0 1285; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1286; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 1287; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1288; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 1289; GFX9-NEXT: s_waitcnt lgkmcnt(2) 1290; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 1291; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0 1292; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1293; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6 1294; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 1295; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 1296; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1297; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 1298; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1299; GFX9-NEXT: global_store_dword v8, v0, s[0:1] offset:40 1300; GFX9-NEXT: s_endpgm 1301 float addrspace(1)* nocapture %arg, 1302 [4 x [4 x float]] addrspace(3)* %arg1, 1303 [4 x [4 x float]] addrspace(3)* %arg2, 1304 [4 x [4 x float]] addrspace(3)* %arg3, 1305 [4 x [4 x float]] addrspace(3)* %arg4) #1 { 1306bb: 1307 %tmp = getelementptr float, float addrspace(1)* %arg, i64 10 1308 %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2 1309 %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 1310 %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0 1311 %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5 1312 %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0 1313 %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5 1314 %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1 1315 %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5 1316 %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1 1317 %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* 
%arg4, i32 0, i32 1, i32 %tmp5 1318 %tmp15 = load float, float addrspace(3)* %tmp7 1319 %tmp16 = load float, float addrspace(3)* %tmp8 1320 %tmp17 = fmul float %tmp15, %tmp16 1321 %tmp18 = fadd float 2.000000e+00, %tmp17 1322 %tmp19 = load float, float addrspace(3)* %tmp9 1323 %tmp20 = load float, float addrspace(3)* %tmp10 1324 %tmp21 = fmul float %tmp19, %tmp20 1325 %tmp22 = fsub float %tmp18, %tmp21 1326 %tmp23 = load float, float addrspace(3)* %tmp11 1327 %tmp24 = load float, float addrspace(3)* %tmp12 1328 %tmp25 = fmul float %tmp23, %tmp24 1329 %tmp26 = fsub float %tmp22, %tmp25 1330 %tmp27 = load float, float addrspace(3)* %tmp13 1331 %tmp28 = load float, float addrspace(3)* %tmp14 1332 %tmp29 = fmul float %tmp27, %tmp28 1333 %tmp30 = fsub float %tmp26, %tmp29 1334 store float %tmp30, float addrspace(1)* %tmp 1335 ret void 1336} 1337 1338define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) { 1339; CI-LABEL: ds_read_call_read: 1340; CI: ; %bb.0: 1341; CI-NEXT: s_getpc_b64 s[40:41] 1342; CI-NEXT: s_mov_b32 s40, s0 1343; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 1344; CI-NEXT: s_mov_b32 s14, s10 1345; CI-NEXT: s_mov_b32 s12, s8 1346; CI-NEXT: s_mov_b32 s13, s9 1347; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 1348; CI-NEXT: s_waitcnt lgkmcnt(0) 1349; CI-NEXT: s_add_u32 s40, s40, s11 1350; CI-NEXT: s_mov_b64 s[10:11], s[6:7] 1351; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 1352; CI-NEXT: s_load_dword s6, s[4:5], 0x2 1353; CI-NEXT: s_addc_u32 s41, s41, 0 1354; CI-NEXT: s_add_u32 s8, s4, 12 1355; CI-NEXT: s_addc_u32 s9, s5, 0 1356; CI-NEXT: s_getpc_b64 s[4:5] 1357; CI-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4 1358; CI-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12 1359; CI-NEXT: s_waitcnt lgkmcnt(0) 1360; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3 1361; CI-NEXT: s_mov_b32 m0, -1 1362; CI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 1363; CI-NEXT: ds_read_b32 v41, v40 1364; CI-NEXT: v_lshlrev_b32_e32 v1, 
10, v1 1365; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 1366; CI-NEXT: v_or_b32_e32 v0, v0, v1 1367; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 1368; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 1369; CI-NEXT: s_mov_b64 s[0:1], s[40:41] 1370; CI-NEXT: v_or_b32_e32 v31, v0, v2 1371; CI-NEXT: s_mov_b64 s[2:3], s[42:43] 1372; CI-NEXT: s_mov_b32 s32, 0 1373; CI-NEXT: s_mov_b32 s39, 0xf000 1374; CI-NEXT: s_mov_b32 s38, -1 1375; CI-NEXT: s_waitcnt lgkmcnt(0) 1376; CI-NEXT: s_swappc_b64 s[30:31], s[16:17] 1377; CI-NEXT: ds_read_b32 v0, v40 offset:4 1378; CI-NEXT: s_waitcnt lgkmcnt(0) 1379; CI-NEXT: v_add_i32_e32 v0, vcc, v41, v0 1380; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 1381; CI-NEXT: s_endpgm 1382; 1383; GFX9-LABEL: ds_read_call_read: 1384; GFX9: ; %bb.0: 1385; GFX9-NEXT: s_getpc_b64 s[36:37] 1386; GFX9-NEXT: s_mov_b32 s36, s0 1387; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 1388; GFX9-NEXT: s_mov_b32 s14, s10 1389; GFX9-NEXT: s_mov_b32 s12, s8 1390; GFX9-NEXT: s_mov_b32 s13, s9 1391; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 1392; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX9-NEXT: s_add_u32 s36, s36, s11 1394; GFX9-NEXT: s_addc_u32 s37, s37, 0 1395; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] 1396; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 1397; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 1398; GFX9-NEXT: s_add_u32 s8, s4, 12 1399; GFX9-NEXT: s_addc_u32 s9, s5, 0 1400; GFX9-NEXT: s_getpc_b64 s[4:5] 1401; GFX9-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4 1402; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12 1403; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1404; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6 1405; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 1406; GFX9-NEXT: ds_read_b32 v42, v41 1407; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 1408; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] 1409; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] 1410; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] 1411; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 1412; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] 1413; GFX9-NEXT: 
s_mov_b32 s32, 0 1414; GFX9-NEXT: v_mov_b32_e32 v40, 0 1415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 1417; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 1418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1419; GFX9-NEXT: v_add_u32_e32 v0, v42, v0 1420; GFX9-NEXT: global_store_dword v40, v0, s[34:35] 1421; GFX9-NEXT: s_endpgm 1422 %x = call i32 @llvm.amdgcn.workitem.id.x() 1423 %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x 1424 %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1 1425 %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4 1426 call void @void_func_void() 1427 %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4 1428 %r = add i32 %v0, %v1 1429 store i32 %r, i32 addrspace(1)* %out, align 4 1430 ret void 1431} 1432 1433define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) { 1434; CI-LABEL: ds_read_interp_read: 1435; CI: ; %bb.0: 1436; CI-NEXT: s_mov_b32 m0, -1 1437; CI-NEXT: ds_read_b32 v2, v0 1438; CI-NEXT: s_mov_b32 m0, s0 1439; CI-NEXT: v_interp_mov_f32 v1, p10, attr0.x 1440; CI-NEXT: s_mov_b32 m0, -1 1441; CI-NEXT: ds_read_b32 v0, v0 offset:16 1442; CI-NEXT: s_waitcnt lgkmcnt(0) 1443; CI-NEXT: v_add_f32_e32 v1, v0, v1 1444; CI-NEXT: v_mov_b32_e32 v0, v2 1445; CI-NEXT: ; return to shader part epilog 1446; 1447; GFX9-LABEL: ds_read_interp_read: 1448; GFX9: ; %bb.0: 1449; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:4 1450; GFX9-NEXT: s_mov_b32 m0, s0 1451; GFX9-NEXT: s_nop 0 1452; GFX9-NEXT: v_interp_mov_f32_e32 v2, p10, attr0.x 1453; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1454; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 1455; GFX9-NEXT: ; return to shader part epilog 1456 %v0 = load float, float addrspace(3)* %inptr, align 4 1457 %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims) 1458 %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4 1459 %v1 = load float, float addrspace(3)* %ptr1, align 4 1460 %v1b = fadd float %v1, %intrp 1461 
%r0 = insertelement <2 x float> undef, float %v0, i32 0 1462 %r1 = insertelement <2 x float> %r0, float %v1b, i32 1 1463 ret <2 x float> %r1 1464} 1465 1466@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 1467 1468define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { 1469; CI-LABEL: read2_v2i32_align1_odd_offset: 1470; CI: ; %bb.0: ; %entry 1471; CI-NEXT: v_mov_b32_e32 v0, 0 1472; CI-NEXT: s_mov_b32 m0, -1 1473; CI-NEXT: ds_read_u8 v1, v0 offset:70 1474; CI-NEXT: ds_read_u8 v2, v0 offset:72 1475; CI-NEXT: ds_read_u8 v3, v0 offset:71 1476; CI-NEXT: ds_read_u8 v4, v0 offset:69 1477; CI-NEXT: ds_read_u8 v5, v0 offset:68 1478; CI-NEXT: s_waitcnt lgkmcnt(4) 1479; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1480; CI-NEXT: s_waitcnt lgkmcnt(3) 1481; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1482; CI-NEXT: s_waitcnt lgkmcnt(2) 1483; CI-NEXT: v_or_b32_e32 v2, v2, v3 1484; CI-NEXT: s_waitcnt lgkmcnt(1) 1485; CI-NEXT: v_or_b32_e32 v1, v1, v4 1486; CI-NEXT: ds_read_u8 v4, v0 offset:67 1487; CI-NEXT: ds_read_u8 v6, v0 offset:66 1488; CI-NEXT: ds_read_u8 v0, v0 offset:65 1489; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1490; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1491; CI-NEXT: v_or_b32_e32 v1, v2, v1 1492; CI-NEXT: s_waitcnt lgkmcnt(0) 1493; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 1494; CI-NEXT: v_or_b32_e32 v0, v2, v0 1495; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 1496; CI-NEXT: v_or_b32_e32 v2, v2, v4 1497; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1498; CI-NEXT: s_mov_b32 s3, 0xf000 1499; CI-NEXT: s_mov_b32 s2, -1 1500; CI-NEXT: v_or_b32_e32 v0, v2, v0 1501; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1502; CI-NEXT: s_endpgm 1503; 1504; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1505; GFX9-ALIGNED: ; %bb.0: ; %entry 1506; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 1507; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:70 1508; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:65 1509; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:66 
1510; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:67 1511; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:68 1512; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 1513; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72 1514; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 1515; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) 1516; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1517; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1518; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1519; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 1520; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 1521; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1522; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v0 1523; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v4 1524; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 1525; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v6 1526; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1527; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 1528; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1529; GFX9-ALIGNED-NEXT: s_endpgm 1530; 1531; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1532; GFX9-UNALIGNED: ; %bb.0: ; %entry 1533; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 1534; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1535; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 1536; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1537; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1538; GFX9-UNALIGNED-NEXT: s_endpgm 1539entry: 1540 %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 1541 store <2 x i32> %load, <2 x i32> addrspace(1)* %out 1542 ret void 1543} 1544 1545declare void @void_func_void() #3 1546 1547declare i32 
@llvm.amdgcn.workgroup.id.x() #1 1548declare i32 @llvm.amdgcn.workgroup.id.y() #1 1549declare i32 @llvm.amdgcn.workitem.id.x() #1 1550declare i32 @llvm.amdgcn.workitem.id.y() #1 1551 1552declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone 1553 1554declare void @llvm.amdgcn.s.barrier() #2 1555 1556attributes #0 = { nounwind } 1557attributes #1 = { nounwind readnone speculatable } 1558attributes #2 = { convergent nounwind } 1559attributes #3 = { nounwind noinline } 1560