; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
; FIXME: Test with SI when argument lowering not broken for f16
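
; These tests exercise @llvm.amdgcn.raw.buffer.load.* (declarations at the end
; of the file). The operands are the <4 x i32> resource descriptor, a voffset,
; an soffset, and an immediate cache-policy word (see the glc/slc/dlc tests
; below). The cases cover SGPR/VGPR placements of each operand, result types
; from i8 up to <4 x float>, and the folding rules for constant offsets.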

; Natural mapping
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

; Copies for VGPR arguments
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

; Waterfall for rsrc
define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
  ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.4:
  ; CHECK-NEXT: successors: %bb.5(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.5:
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

; Waterfall for rsrc and soffset
define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
  ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
  ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.4:
  ; CHECK-NEXT: successors: %bb.5(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.5:
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}
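
; In the two waterfall tests above, the divergent operands are handled one
; uniform value at a time: V_READFIRSTLANE_B32 extracts the rsrc (and, in the
; second test, soffset) of the first active lane, the V_CMP/S_AND sequence
; builds a mask of the lanes holding that same value, S_AND_SAVEEXEC_B64
; restricts exec to those lanes for the load, and SI_WATERFALL_LOOP re-runs
; bb.2 until every lane has been serviced before the saved exec is restored.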

; Natural mapping + glc
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1)
  ret float %val
}

; Natural mapping + slc
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 2, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
  ret float %val
}

; Natural mapping + dlc
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4)
  ret float %val
}

; Natural mapping + slc + dlc
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 6, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6)
  ret float %val
}

; Natural mapping + glc + dlc
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 5, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5)
  ret float %val
}

; Natural mapping + glc + slc + dlc
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc_dlc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 7, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7)
  ret float %val
}
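
; The cache-policy immediate is a bitfield, with bit 0 = glc, bit 1 = slc and
; bit 2 = dlc, matching the 1/2/4/6/5/7 values exercised above. For example, a
; hypothetical glc+slc load (not tested here) would be requested as:
;   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3)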

; Natural mapping
define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 4)
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
  ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]]
  ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret <2 x float> %val
}

define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 4)
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2
  ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]]
  ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]]
  ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
  %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret <3 x float> %val
}

define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 4)
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub3
  ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]]
  ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]]
  ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]]
  ; CHECK-NEXT: $vgpr3 = COPY [[COPY9]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
  %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret <4 x float> %val
}

define amdgpu_ps half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret half %val
}

define amdgpu_ps <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret <2 x half> %val
}

; FIXME: Crashes
; define amdgpu_ps <3 x half> @raw_buffer_load_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
;   %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
;   ret <3 x half> %val
; }

define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 4)
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
  ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]]
  ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret <4 x half> %val
}

define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %zext = zext i8 %val to i32
  %cast = bitcast i32 %zext to float
  ret float %cast
}

define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 4)
  ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_OFFEN]], 0, 8, implicit $exec
  ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %sext = sext i8 %val to i32
  %cast = bitcast i32 %sext to float
  ret float %cast
}

; Waterfall for rsrc
define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
  ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4)
  ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.4:
  ; CHECK-NEXT: successors: %bb.5(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.5:
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret half %val
}

; Waterfall for rsrc
define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
  ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 4)
  ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.4:
  ; CHECK-NEXT: successors: %bb.5(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.5:
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %zext = zext i8 %val to i32
  %cast = bitcast i32 %zext to float
  ret float %cast
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset0(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset0
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
  ret float %val
}
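
; 4095 is the largest value the MUBUF immediate offset field can hold, so the
; 4095 case above folds into the OFFSET form, while 4096 has to be materialized
; with S_MOV_B32 and copied into a VGPR for the OFFEN form.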

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %voffset = add i32 %voffset.base, 16
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %voffset = add i32 %voffset.base, 4095
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
  ; CHECK-NEXT: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %voffset = add i32 %voffset.base, 4096
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095(<4 x i32> inreg %rsrc, i32 %voffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096(<4 x i32> inreg %rsrc, i32 %voffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
  ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 16
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
  ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 4095
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
  ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 4096
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}
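
; Because soffset lives in an SGPR, the constant addends above (16/4095/4096)
; are materialized with S_MOV_B32 and folded into a single scalar S_ADD_I32;
; unlike voffset, there is no immediate field to absorb them.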

; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop.
define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset.base) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
  ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
  ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
  ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.4:
  ; CHECK-NEXT: successors: %bb.5(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.5:
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 5000
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}
define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000(<4 x i32> %rsrc, i32 %voffset.base, i32 inreg %soffset) {
  ; CHECK-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
  ; CHECK-NEXT: %14:vgpr_32, dead %38:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
  ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
  ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
  ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
  ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
  ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %14, [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.4:
  ; CHECK-NEXT: successors: %bb.5(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.5:
  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %voffset = add i32 %voffset.base, 5000
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %val
}

declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg)
declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32 immarg)
declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32 immarg)
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32 immarg)

declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32 immarg)
declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32 immarg)
declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32 immarg)

declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32 immarg)