; Codegen test for f16/v2f16 fma: SI promotes to f32 (cvt + v_fma_f32 + cvt back),
; VI (gfx8) selects native v_fma_f16.
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)

; GCN-LABEL: {{^}}fma_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fma_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; 0x4200 is half 3.0.
; GCN-LABEL: {{^}}fma_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fma_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half 3.0, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fma_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_c
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fma_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half 3.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fma_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_a
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]]
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fma_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> <half 3.0, half 3.0>, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_b
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}}
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fma_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> <half 3.0, half 3.0>, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_c
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}}
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fma_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> <half 3.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}