; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s

; Codegen test for taking a 4-element subvector out of an 8-element vector of
; 16-bit elements. Every function below has the same shape: one of two
; volatile 8-element loads is selected by a branch, the results are merged by
; a phi in %exit, a shufflevector picks four lanes, and a compare+select
; produces the returned 4-element vector. The expected assembly is checked
; once for the default subtarget and once for -mcpu=gfx900 (see the two RUN
; lines above).
;
; The assertion lines are tool-generated; after changing any IR below, rerun
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; NOTE(review): both RUN lines pass -march=amdgcn together with
; -mtriple=amdgcn--; this looks redundant - confirm before dropping either.
; NOTE(review): the branch condition in every function is `undef`; presumably
; that was only meant as "some unknown condition", but branching on undef is
; likely undefined behaviour per the LangRef - TODO confirm, and regenerate
; the assertions if the condition is replaced by a real argument.
; NOTE(review): the pointer arguments use typed-pointer syntax
; (`<8 x i16> addrspace(1) *`); presumably this needs converting to `ptr`
; for newer LLVM releases - verify against the toolchain in use.

; Selects lanes <0, 1, 2, 2> of the merged <8 x i16> value, i.e. the low half
; with lane 2 repeated. Each selected lane is compared `sgt -1`; the result
; lane is -32768 where the compare is true and -1 otherwise.
define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB0_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB0_3
; SI-NEXT:    s_branch .LBB0_4
; SI-NEXT:  .LBB0_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB0_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:  .LBB0_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v6, 1
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB0_3
; GFX9-NEXT:    s_branch .LBB0_4
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB0_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB0_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
  br i1 undef, label %T, label %F

T:
  ; Volatile keeps the full 8-element load even though only some lanes are used.
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Lanes 0, 1, 2 and 2 again: only the low half of %m is read.
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; Same pattern as @extract_4xi16, but the shuffle selects lanes <4, 5, 6, 7>,
; i.e. the high half of the merged <8 x i16> value.
define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: extract_4xi16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB1_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB1_3
; SI-NEXT:    s_branch .LBB1_4
; SI-NEXT:  .LBB1_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr5
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB1_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:  .LBB1_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v7, 1
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_4xi16_2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB1_3
; GFX9-NEXT:    s_branch .LBB1_4
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB1_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB1_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Lanes 4..7: only the high half of %m is read.
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; Half-precision variant of @extract_4xi16: lanes <0, 1, 2, 2> of the merged
; <8 x half> value are compared `fcmp ugt` against 0xH3800 (printed as 0.5 in
; the checked assembly); the result lane is 0xH3900 where the compare is true
; and 0xH3D00 otherwise.
define <4 x half> @extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
; SI-LABEL: extract_4xf16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB2_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v4, v4, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB2_3
; SI-NEXT:    s_branch .LBB2_4
; SI-NEXT:  .LBB2_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB2_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:  .LBB2_4: ; %exit
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_4xf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB2_3
; GFX9-NEXT:    s_branch .LBB2_4
; GFX9-NEXT:  .LBB2_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB2_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB2_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v4, v3, vcc
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  ; Lanes 0, 1, 2 and 2 again: only the low half of %m is read.
  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; `ugt` is an unordered compare, so a NaN lane also selects the first operand.
  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
  ret <4 x half> %r2
}