; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Codegen tests for f16 and <2 x half> fsub on AMDGPU.
; On SI there is no native f16 arithmetic: operands are converted to f32
; (v_cvt_f32_f16), subtracted with v_subrev_f32/v_add_f32, and converted
; back (v_cvt_f16_f32). On VI the native v_sub_f16/v_subrev_f16 forms are
; used, and subtraction of a constant is folded to v_add_f16 of the
; negated inline immediate (e.g. -2.0 for "x - 2.0").

; GCN-LABEL: {{^}}fsub_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fsub_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %r.val = fsub half %a.val, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fsub_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = fsub half 1.0, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0xc000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @fsub_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = fsub half %a.val, 2.0
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fsub_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fsub <2 x half> %a.val, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16_imm_a
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fsub_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16_imm_b
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define void @fsub_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}