; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI

define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_dword v2, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_dword v2, off, s[16:19], 0
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v2, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v1, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}