; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI

define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}