; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s

; Code generation test for <N x float> fadd, fmul, fma and fneg on two AMDGPU
; subtargets. The file is compiled twice: the gfx900 checks mostly expect one
; 32-bit VALU instruction per vector element, while the gfx90a checks mostly
; expect packed v_pk_*_f32 instructions that each cover two elements.
;
; NOTE: comments in this file must not spell a check prefix followed by a
; colon, otherwise FileCheck would treat the comment as a directive.

;------------------------------------------------------------------------------
; fadd: vector + vector, vector + kernel argument, and wider vectors.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}fadd_v2_vv:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fadd_v2_vv(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, %load
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_vs:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, %x
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v4_vs:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
  %add = fadd <4 x float> %load, %x
  store <4 x float> %add, <4 x float> addrspace(1)* %gep, align 16
  ret void
}

; GCN-LABEL: {{^}}fadd_v32_vs:
; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
  %add = fadd <32 x float> %load, %x
  store <32 x float> %add, <32 x float> addrspace(1)* %gep, align 128
  ret void
}

;------------------------------------------------------------------------------
; fadd with splat and constant operands. A splat operand is expected to be
; encoded on gfx90a with op_sel_hi:[1,0]; constant vectors whose two lanes
; differ are expected to be materialized in an SGPR pair.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}fadd_v2_v_imm:
; GFX90A: s_mov_b32 s[[K:[0-9]+]], 0x42c80000
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 1.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 0.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 0.0, float 1.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 2.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

;------------------------------------------------------------------------------
; fadd where one or both lanes of the second operand are a negated scalar
; kernel argument. The gfx90a checks look for the negation being expressed
; through neg_lo / neg_hi modifiers on the packed instruction.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}fadd_v2_v_fneg:
; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %x, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %x, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo2(<2 x float> addrspace(1)* %a, float %x, float %y) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %y, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi2(<2 x float> addrspace(1)* %a, float %x, float %y) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %y, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

;------------------------------------------------------------------------------
; fmul: same operand shapes as the fadd kernels above.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}fmul_v2_vv:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fmul_v2_vv(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, %load
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fmul_v2_vs:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, %x
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fmul_v4_vs:
; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
  %mul = fmul <4 x float> %load, %x
  store <4 x float> %mul, <4 x float> addrspace(1)* %gep, align 16
  ret void
}

; GCN-LABEL: {{^}}fmul_v32_vs:
; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
  %mul = fmul <32 x float> %load, %x
  store <32 x float> %mul, <32 x float> addrspace(1)* %gep, align 128
  ret void
}

; GCN-LABEL: {{^}}fmul_v2_v_imm:
; GFX90A: s_mov_b32 s[[K:[0-9]+]], 0x42c80000
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, <float 4.0, float 4.0>
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit:
; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, <float 4.0, float 3.0>
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fmul_v2_v_fneg:
; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fmul_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

;------------------------------------------------------------------------------
; fma (via the llvm.fma intrinsic): same operand shapes, three sources.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}fma_v2_vv:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fma_v2_vv(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fma_v2_vs:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fma_v4_vs:
; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
  store <4 x float> %fma, <4 x float> addrspace(1)* %gep, align 16
  ret void
}

; GCN-LABEL: {{^}}fma_v32_vs:
; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX90A-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
  %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
  store <32 x float> %fma, <32 x float> addrspace(1)* %gep, align 128
  ret void
}

; GCN-LABEL: {{^}}fma_v2_v_imm:
; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000
; GCN-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_imm(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fma_v2_v_v_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fma_v2_v_fneg:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
define amdgpu_kernel void @fma_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

;------------------------------------------------------------------------------
; Negation and shuffles feeding packed operations, with operands loaded
; (volatile) through addrspace(3) pointers.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
; GFX900-COUNT-2: v_sub_f32_e32
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
bb:
  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
  %neg.scalar0 = fsub float -0.0, %scalar0

  %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer

  %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
bb:
  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds float, float addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
  %scalar1 = load volatile float, float addrspace(3)* %arg2.gep, align 4

  %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0
  %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
  %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2

  %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}shuffle_add_f32:
; GFX900-COUNT-2: v_add_f32_e32
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @shuffle_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
  %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.swap
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}shuffle_neg_add_f32:
; GFX900-COUNT-2: v_sub_f32_e32
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @shuffle_neg_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
  ; The result of this volatile load is not used by the rest of the kernel.
  %f32 = load volatile float, float addrspace(3)* undef, align 8
  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
  %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
  %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.neg.swap
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_fadd_fsub:
; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) {
bb:
  %i12 = fadd <2 x float> zeroinitializer, %arg
  %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %i13 = fadd <2 x float> zeroinitializer, %shift8
  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
  %i15 = fsub <2 x float> %i14, zeroinitializer
  store <2 x float> %i15, <2 x float>* undef
  ret void
}

; GCN-LABEL: {{^}}fadd_shuffle_v4:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]
define amdgpu_kernel void @fadd_shuffle_v4(<4 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %gep
  %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer
  %add.1 = fadd <4 x float> %in.1, %shuf
  store <4 x float> %add.1, <4 x float> addrspace(1)* %gep
  ret void
}

;------------------------------------------------------------------------------
; Stand-alone vector fneg, on a loaded value and on a kernel argument.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}fneg_v2f32_vec:
; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: {{^}}fneg_v2f32_scalar:
; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
  store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)