; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

; Codegen checks for `select` on half values whose condition comes from an
; ordered less-than compare (`fcmp olt`). Two llc invocations are checked, one
; for -mcpu=tahiti and one for -mcpu=fiji. In the expected tahiti output the
; operands are widened with v_cvt_f32_f16, compared and selected as f32, and
; the result is narrowed again with v_cvt_f16_f32. In the expected fiji output
; the compare is a native f16 compare and no conversions appear.
;
; The check lines are generated; regenerate them with the script named in the
; first line instead of editing them by hand.

; Base case: every operand is loaded from memory.
;   r = (a < b) ? c : d
; The four operand loads are volatile (presumably to keep them as separate,
; ordered buffer loads -- confirm against the test's history).
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_mov_b32 s16, s8
; VI-NEXT:    s_mov_b32 s17, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Constant as the left-hand compare operand.
;   r = (0.5 < b) ? c : d        (0xH3800 is half 0.5)
; Both expected outputs encode the constant as the inline operand 0.5 of the
; less-than compare.
define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Constant as the right-hand compare operand.
;   r = (a < 0.5) ? c : d
; The expected output commutes the compare: it is emitted as a greater-than
; compare with 0.5 as the first (inline constant) operand.
define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Constant as the true value of the select.
;   r = (a < b) ? 0.5 : d
; The expected output inverts the compare to not-less-than and swaps the
; v_cndmask sources. For tahiti the constant is the inline operand 0.5 of
; v_cndmask; for fiji it is first moved into a VGPR as 0x3800.
define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Constant as the false value of the select.
;   r = (a < b) ? c : 0.5        (0xH3800 is half 0.5)
; The expected output keeps the less-than compare. On the -mcpu=tahiti run the
; constant is the inline operand 0.5 of v_cndmask; on the -mcpu=fiji run it is
; first moved into a VGPR as 0x3800.
define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Vector base case: element-wise r = (a < b) ? c : d on <2 x half>.
; Unlike the scalar tests, the loads here are not volatile. Each operand is
; fetched with a single buffer_load_dword. The expected output extracts the
; high element with a 16-bit right shift, compares/selects each element
; separately, and repacks the result with a 16-bit left shift plus an OR
; (v_or_b32 on the tahiti run, v_or_b32_sdwa on the fiji run).
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_mov_b32 s16, s8
; VI-NEXT:    s_mov_b32 s17, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant <0.5, 0.625> (0xH3800, 0xH3900) as the left-hand compare
; operand. In the expected output the first element's constant is the inline
; operand 0.5, while the second element's constant is materialized in an SGPR:
; 0x3f200000 (0.625 as f32) on the tahiti run and 0x3900 on the fiji run.
define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s0, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s0, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant <0.5, 0.625> as the right-hand compare operand. As in the
; scalar right-hand-constant test, the expected output commutes each compare
; into a greater-than compare with the constant as the first operand.
define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s0, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s0, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant <0.5, 0.625> as the true value of the select. The expected
; output uses inverted (not-less-than) compares. On the tahiti run 0.5 is an
; inline v_cndmask operand and 0.625 is moved into a VGPR as 0x3f200000; on the
; fiji run both constants are moved into VGPRs as 0x3800 and 0x3900.
define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v4, off, s[12:15], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v4
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant <0.5, 0.625> as the false value of the select. The expected
; output keeps the less-than compares; the constants are materialized the same
; way as in the true-value vector test (inline 0.5 plus 0x3f200000 on the
; tahiti run, 0x3800 and 0x3900 in VGPRs on the fiji run).
define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s12, s4
; VI-NEXT:    s_mov_b32 s13, s5
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v4, off, s[12:15], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v4
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}