; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Tests lowering of f16 and <2 x half> select with an olt compare.
; On SI the f16 operands are promoted to f32 (v_cvt_f32_f16 before the
; compare/cndmask, v_cvt_f16_f32 after); on VI native f16 compare and
; cndmask instructions are selected directly.

; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %d.val = load half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_d(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e64
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e64
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32

; VI: v_cmp_lt_f16_e64
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e64
; VI: v_cndmask_b32_e32

; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI-DAG: v_cmp_gt_f32_e64
; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5

; VI: v_cmp_lt_f16_e32
; VI: v_cmp_gt_f16_e64
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e64
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI-DAG: v_cmp_lt_f32_e64
; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5

; VI: v_cmp_gt_f16_e32
; VI: v_cmp_lt_f16_e64
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e64

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_nlt_f32_e32
; SI: v_cmp_nlt_f32_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e64
; SI: v_cmp_lt_f32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cmp_lt_f16_e64
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_d(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}