1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s 4 5define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) { 6; SI-LABEL: vec_8xi16_extract_4xi16: 7; SI: ; %bb.0: 8; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9; SI-NEXT: s_cbranch_scc0 .LBB0_2 10; SI-NEXT: ; %bb.1: ; %F 11; SI-NEXT: s_mov_b32 s6, 0 12; SI-NEXT: s_mov_b32 s7, 0xf000 13; SI-NEXT: s_mov_b32 s4, s6 14; SI-NEXT: s_mov_b32 s5, s6 15; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 16; SI-NEXT: s_waitcnt vmcnt(0) 17; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc 18; SI-NEXT: s_waitcnt vmcnt(0) 19; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc 20; SI-NEXT: s_waitcnt vmcnt(0) 21; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc 22; SI-NEXT: s_waitcnt vmcnt(0) 23; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc 24; SI-NEXT: s_waitcnt vmcnt(0) 25; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc 26; SI-NEXT: s_waitcnt vmcnt(0) 27; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc 28; SI-NEXT: s_waitcnt vmcnt(0) 29; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc 30; SI-NEXT: s_waitcnt vmcnt(0) 31; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 32; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 33; SI-NEXT: v_or_b32_e32 v2, v6, v2 34; SI-NEXT: v_or_b32_e32 v3, v4, v3 35; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 36; SI-NEXT: s_mov_b64 vcc, exec 37; SI-NEXT: s_cbranch_execz .LBB0_3 38; SI-NEXT: s_branch .LBB0_4 39; SI-NEXT: .LBB0_2: 40; SI-NEXT: ; implicit-def: $vgpr3 41; SI-NEXT: ; implicit-def: $vgpr4 42; SI-NEXT: ; implicit-def: $vgpr2 43; SI-NEXT: s_mov_b64 
vcc, 0 44; SI-NEXT: .LBB0_3: ; %T 45; SI-NEXT: s_mov_b32 s6, 0 46; SI-NEXT: s_mov_b32 s7, 0xf000 47; SI-NEXT: s_mov_b32 s4, s6 48; SI-NEXT: s_mov_b32 s5, s6 49; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc 50; SI-NEXT: s_waitcnt vmcnt(0) 51; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc 52; SI-NEXT: s_waitcnt vmcnt(0) 53; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc 54; SI-NEXT: s_waitcnt vmcnt(0) 55; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc 56; SI-NEXT: s_waitcnt vmcnt(0) 57; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc 58; SI-NEXT: s_waitcnt vmcnt(0) 59; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc 60; SI-NEXT: s_waitcnt vmcnt(0) 61; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc 62; SI-NEXT: s_waitcnt vmcnt(0) 63; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc 64; SI-NEXT: s_waitcnt vmcnt(0) 65; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 66; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 67; SI-NEXT: v_or_b32_e32 v2, v4, v0 68; SI-NEXT: v_or_b32_e32 v3, v3, v1 69; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 70; SI-NEXT: .LBB0_4: ; %exit 71; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 72; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 73; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 74; SI-NEXT: v_mov_b32_e32 v3, 0xffff 75; SI-NEXT: v_mov_b32_e32 v4, 0x8000 76; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 77; SI-NEXT: v_bfrev_b32_e32 v6, 1 78; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 79; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 80; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 81; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 82; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc 83; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 84; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc 85; SI-NEXT: v_or_b32_e32 v0, v0, v1 86; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 87; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 88; SI-NEXT: v_or_b32_e32 v2, v2, v3 89; SI-NEXT: v_alignbit_b32 v1, v2, v1, 
16 90; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 91; SI-NEXT: s_setpc_b64 s[30:31] 92; 93; GFX9-LABEL: vec_8xi16_extract_4xi16: 94; GFX9: ; %bb.0: 95; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 96; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 97; GFX9-NEXT: ; %bb.1: ; %F 98; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc 99; GFX9-NEXT: s_waitcnt vmcnt(0) 100; GFX9-NEXT: s_cbranch_execz .LBB0_3 101; GFX9-NEXT: s_branch .LBB0_4 102; GFX9-NEXT: .LBB0_2: 103; GFX9-NEXT: s_mov_b32 s8, 0 104; GFX9-NEXT: s_mov_b32 s9, s8 105; GFX9-NEXT: s_mov_b32 s10, s8 106; GFX9-NEXT: s_mov_b32 s11, s8 107; GFX9-NEXT: v_mov_b32_e32 v2, s8 108; GFX9-NEXT: v_mov_b32_e32 v3, s9 109; GFX9-NEXT: v_mov_b32_e32 v4, s10 110; GFX9-NEXT: v_mov_b32_e32 v5, s11 111; GFX9-NEXT: .LBB0_3: ; %T 112; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc 113; GFX9-NEXT: s_waitcnt vmcnt(0) 114; GFX9-NEXT: .LBB0_4: ; %exit 115; GFX9-NEXT: s_waitcnt vmcnt(0) 116; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] 117; GFX9-NEXT: s_movk_i32 s4, 0x8000 118; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 119; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 120; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] 121; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 122; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 123; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 124; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 125; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 126; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 127; GFX9-NEXT: s_setpc_b64 s[30:31] 128 br i1 undef, label %T, label %F 129 130T: 131 %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0 132 br label %exit 133 134F: 135 %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1 136 br label %exit 137 138exit: 139 %m = phi <8 x i16> [ %t, %T ], [ %f, %F ] 140 %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 141 %b2 
= icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1> 142 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> 143 ret <4 x i16> %r2 144} 145 146define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) { 147; SI-LABEL: vec_8xi16_extract_4xi16_2: 148; SI: ; %bb.0: 149; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 150; SI-NEXT: s_cbranch_scc0 .LBB1_2 151; SI-NEXT: ; %bb.1: ; %F 152; SI-NEXT: s_mov_b32 s6, 0 153; SI-NEXT: s_mov_b32 s7, 0xf000 154; SI-NEXT: s_mov_b32 s4, s6 155; SI-NEXT: s_mov_b32 s5, s6 156; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 157; SI-NEXT: s_waitcnt vmcnt(0) 158; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc 159; SI-NEXT: s_waitcnt vmcnt(0) 160; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc 161; SI-NEXT: s_waitcnt vmcnt(0) 162; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc 163; SI-NEXT: s_waitcnt vmcnt(0) 164; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc 165; SI-NEXT: s_waitcnt vmcnt(0) 166; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc 167; SI-NEXT: s_waitcnt vmcnt(0) 168; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc 169; SI-NEXT: s_waitcnt vmcnt(0) 170; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc 171; SI-NEXT: s_waitcnt vmcnt(0) 172; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 173; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 174; SI-NEXT: v_or_b32_e32 v2, v6, v2 175; SI-NEXT: v_or_b32_e32 v3, v4, v3 176; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 177; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 178; SI-NEXT: s_mov_b64 vcc, exec 179; SI-NEXT: s_cbranch_execz .LBB1_3 180; SI-NEXT: s_branch .LBB1_4 181; SI-NEXT: .LBB1_2: 182; SI-NEXT: ; implicit-def: $vgpr3 183; SI-NEXT: ; implicit-def: $vgpr5 184; SI-NEXT: ; implicit-def: $vgpr2 185; SI-NEXT: ; 
implicit-def: $vgpr4 186; SI-NEXT: s_mov_b64 vcc, 0 187; SI-NEXT: .LBB1_3: ; %T 188; SI-NEXT: s_mov_b32 s6, 0 189; SI-NEXT: s_mov_b32 s7, 0xf000 190; SI-NEXT: s_mov_b32 s4, s6 191; SI-NEXT: s_mov_b32 s5, s6 192; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 193; SI-NEXT: s_waitcnt vmcnt(0) 194; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc 195; SI-NEXT: s_waitcnt vmcnt(0) 196; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc 197; SI-NEXT: s_waitcnt vmcnt(0) 198; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc 199; SI-NEXT: s_waitcnt vmcnt(0) 200; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc 201; SI-NEXT: s_waitcnt vmcnt(0) 202; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc 203; SI-NEXT: s_waitcnt vmcnt(0) 204; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc 205; SI-NEXT: s_waitcnt vmcnt(0) 206; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc 207; SI-NEXT: s_waitcnt vmcnt(0) 208; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 209; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 210; SI-NEXT: v_or_b32_e32 v2, v4, v0 211; SI-NEXT: v_or_b32_e32 v3, v3, v1 212; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 213; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 214; SI-NEXT: .LBB1_4: ; %exit 215; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 216; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 217; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 218; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 219; SI-NEXT: v_mov_b32_e32 v4, 0xffff 220; SI-NEXT: v_mov_b32_e32 v5, 0x8000 221; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 222; SI-NEXT: v_bfrev_b32_e32 v7, 1 223; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 224; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 225; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 226; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc 227; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 228; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 229; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 230; SI-NEXT: 
v_cndmask_b32_e32 v3, v6, v7, vcc 231; SI-NEXT: v_or_b32_e32 v0, v0, v1 232; SI-NEXT: v_or_b32_e32 v2, v2, v3 233; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 234; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 235; SI-NEXT: s_setpc_b64 s[30:31] 236; 237; GFX9-LABEL: vec_8xi16_extract_4xi16_2: 238; GFX9: ; %bb.0: 239; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 240; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 241; GFX9-NEXT: ; %bb.1: ; %F 242; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc 243; GFX9-NEXT: s_waitcnt vmcnt(0) 244; GFX9-NEXT: s_cbranch_execz .LBB1_3 245; GFX9-NEXT: s_branch .LBB1_4 246; GFX9-NEXT: .LBB1_2: 247; GFX9-NEXT: s_mov_b32 s8, 0 248; GFX9-NEXT: s_mov_b32 s9, s8 249; GFX9-NEXT: s_mov_b32 s10, s8 250; GFX9-NEXT: s_mov_b32 s11, s8 251; GFX9-NEXT: v_mov_b32_e32 v2, s8 252; GFX9-NEXT: v_mov_b32_e32 v3, s9 253; GFX9-NEXT: v_mov_b32_e32 v4, s10 254; GFX9-NEXT: v_mov_b32_e32 v5, s11 255; GFX9-NEXT: .LBB1_3: ; %T 256; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc 257; GFX9-NEXT: s_waitcnt vmcnt(0) 258; GFX9-NEXT: .LBB1_4: ; %exit 259; GFX9-NEXT: s_waitcnt vmcnt(0) 260; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] 261; GFX9-NEXT: s_movk_i32 s4, 0x8000 262; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 263; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 264; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] 265; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 266; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 267; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 268; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 269; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 270; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 271; GFX9-NEXT: s_setpc_b64 s[30:31] 272 br i1 undef, label %T, label %F 273 274T: 275 %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0 276 br label %exit 277 278F: 279 %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1 280 br label 
%exit 281 282exit: 283 %m = phi <8 x i16> [ %t, %T ], [ %f, %F ] 284 %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 285 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1> 286 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> 287 ret <4 x i16> %r2 288} 289 290define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) { 291; SI-LABEL: vec_8xf16_extract_4xf16: 292; SI: ; %bb.0: 293; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 294; SI-NEXT: s_cbranch_scc0 .LBB2_2 295; SI-NEXT: ; %bb.1: ; %F 296; SI-NEXT: s_mov_b32 s6, 0 297; SI-NEXT: s_mov_b32 s7, 0xf000 298; SI-NEXT: s_mov_b32 s4, s6 299; SI-NEXT: s_mov_b32 s5, s6 300; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 301; SI-NEXT: s_waitcnt vmcnt(0) 302; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc 303; SI-NEXT: s_waitcnt vmcnt(0) 304; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc 305; SI-NEXT: s_waitcnt vmcnt(0) 306; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc 307; SI-NEXT: s_waitcnt vmcnt(0) 308; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc 309; SI-NEXT: s_waitcnt vmcnt(0) 310; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc 311; SI-NEXT: s_waitcnt vmcnt(0) 312; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc 313; SI-NEXT: s_waitcnt vmcnt(0) 314; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc 315; SI-NEXT: s_waitcnt vmcnt(0) 316; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 317; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 318; SI-NEXT: v_or_b32_e32 v2, v6, v2 319; SI-NEXT: v_or_b32_e32 v4, v4, v3 320; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 321; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 322; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 323; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 324; SI-NEXT: 
s_mov_b64 vcc, exec 325; SI-NEXT: s_cbranch_execz .LBB2_3 326; SI-NEXT: s_branch .LBB2_4 327; SI-NEXT: .LBB2_2: 328; SI-NEXT: ; implicit-def: $vgpr3 329; SI-NEXT: ; implicit-def: $vgpr4 330; SI-NEXT: ; implicit-def: $vgpr2 331; SI-NEXT: s_mov_b64 vcc, 0 332; SI-NEXT: .LBB2_3: ; %T 333; SI-NEXT: s_mov_b32 s6, 0 334; SI-NEXT: s_mov_b32 s7, 0xf000 335; SI-NEXT: s_mov_b32 s4, s6 336; SI-NEXT: s_mov_b32 s5, s6 337; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 338; SI-NEXT: s_waitcnt vmcnt(0) 339; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc 340; SI-NEXT: s_waitcnt vmcnt(0) 341; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc 342; SI-NEXT: s_waitcnt vmcnt(0) 343; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc 344; SI-NEXT: s_waitcnt vmcnt(0) 345; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc 346; SI-NEXT: s_waitcnt vmcnt(0) 347; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc 348; SI-NEXT: s_waitcnt vmcnt(0) 349; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc 350; SI-NEXT: s_waitcnt vmcnt(0) 351; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc 352; SI-NEXT: s_waitcnt vmcnt(0) 353; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 354; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 355; SI-NEXT: v_or_b32_e32 v0, v4, v0 356; SI-NEXT: v_or_b32_e32 v1, v2, v1 357; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 358; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 359; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 360; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 361; SI-NEXT: .LBB2_4: ; %exit 362; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 363; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 364; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 365; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 366; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 367; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 368; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 369; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 370; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 371; SI-NEXT: 
v_cndmask_b32_e32 v0, v3, v4, vcc 372; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 373; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 374; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 375; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 376; SI-NEXT: v_mov_b32_e32 v3, v2 377; SI-NEXT: s_setpc_b64 s[30:31] 378; 379; GFX9-LABEL: vec_8xf16_extract_4xf16: 380; GFX9: ; %bb.0: 381; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 382; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 383; GFX9-NEXT: ; %bb.1: ; %F 384; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc 385; GFX9-NEXT: s_waitcnt vmcnt(0) 386; GFX9-NEXT: s_cbranch_execz .LBB2_3 387; GFX9-NEXT: s_branch .LBB2_4 388; GFX9-NEXT: .LBB2_2: 389; GFX9-NEXT: s_mov_b32 s8, 0 390; GFX9-NEXT: s_mov_b32 s9, s8 391; GFX9-NEXT: s_mov_b32 s10, s8 392; GFX9-NEXT: s_mov_b32 s11, s8 393; GFX9-NEXT: v_mov_b32_e32 v2, s8 394; GFX9-NEXT: v_mov_b32_e32 v3, s9 395; GFX9-NEXT: v_mov_b32_e32 v4, s10 396; GFX9-NEXT: v_mov_b32_e32 v5, s11 397; GFX9-NEXT: .LBB2_3: ; %T 398; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc 399; GFX9-NEXT: s_waitcnt vmcnt(0) 400; GFX9-NEXT: .LBB2_4: ; %exit 401; GFX9-NEXT: s_waitcnt vmcnt(0) 402; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 403; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 404; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 405; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 406; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 407; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 408; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc 409; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD 410; GFX9-NEXT: v_cndmask_b32_e32 v6, v4, v3, vcc 411; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 412; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 413; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD 414; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 415; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 416; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 417; GFX9-NEXT: s_setpc_b64 s[30:31] 418 br i1 undef, label %T, label %F 419 420T: 421 %t = load 
volatile <8 x half>, <8 x half> addrspace(1) * %p0 422 br label %exit 423 424F: 425 %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1 426 br label %exit 427 428exit: 429 %m = phi <8 x half> [ %t, %T ], [ %f, %F ] 430 %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 431 %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800> 432 %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00> 433 ret <4 x half> %r2 434} 435 436define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) { 437; 438; SI-LABEL: vec_16xi16_extract_4xi16: 439; SI: ; %bb.0: 440; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 441; SI-NEXT: s_cbranch_scc0 .LBB3_2 442; SI-NEXT: ; %bb.1: ; %F 443; SI-NEXT: s_mov_b32 s6, 0 444; SI-NEXT: s_mov_b32 s7, 0xf000 445; SI-NEXT: s_mov_b32 s4, s6 446; SI-NEXT: s_mov_b32 s5, s6 447; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 448; SI-NEXT: s_waitcnt vmcnt(0) 449; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc 450; SI-NEXT: s_waitcnt vmcnt(0) 451; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc 452; SI-NEXT: s_waitcnt vmcnt(0) 453; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc 454; SI-NEXT: s_waitcnt vmcnt(0) 455; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc 456; SI-NEXT: s_waitcnt vmcnt(0) 457; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc 458; SI-NEXT: s_waitcnt vmcnt(0) 459; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc 460; SI-NEXT: s_waitcnt vmcnt(0) 461; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc 462; SI-NEXT: s_waitcnt vmcnt(0) 463; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc 464; SI-NEXT: s_waitcnt vmcnt(0) 
465; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc 466; SI-NEXT: s_waitcnt vmcnt(0) 467; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc 468; SI-NEXT: s_waitcnt vmcnt(0) 469; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc 470; SI-NEXT: s_waitcnt vmcnt(0) 471; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc 472; SI-NEXT: s_waitcnt vmcnt(0) 473; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc 474; SI-NEXT: s_waitcnt vmcnt(0) 475; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc 476; SI-NEXT: s_waitcnt vmcnt(0) 477; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc 478; SI-NEXT: s_waitcnt vmcnt(0) 479; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 480; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 481; SI-NEXT: v_or_b32_e32 v2, v6, v2 482; SI-NEXT: v_or_b32_e32 v3, v4, v3 483; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 484; SI-NEXT: s_mov_b64 vcc, exec 485; SI-NEXT: s_cbranch_execz .LBB3_3 486; SI-NEXT: s_branch .LBB3_4 487; SI-NEXT: .LBB3_2: 488; SI-NEXT: ; implicit-def: $vgpr3 489; SI-NEXT: ; implicit-def: $vgpr4 490; SI-NEXT: ; implicit-def: $vgpr2 491; SI-NEXT: s_mov_b64 vcc, 0 492; SI-NEXT: .LBB3_3: ; %T 493; SI-NEXT: s_mov_b32 s6, 0 494; SI-NEXT: s_mov_b32 s7, 0xf000 495; SI-NEXT: s_mov_b32 s4, s6 496; SI-NEXT: s_mov_b32 s5, s6 497; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc 498; SI-NEXT: s_waitcnt vmcnt(0) 499; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc 500; SI-NEXT: s_waitcnt vmcnt(0) 501; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc 502; SI-NEXT: s_waitcnt vmcnt(0) 503; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc 504; SI-NEXT: s_waitcnt vmcnt(0) 505; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc 506; SI-NEXT: s_waitcnt vmcnt(0) 507; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc 
508; SI-NEXT: s_waitcnt vmcnt(0) 509; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc 510; SI-NEXT: s_waitcnt vmcnt(0) 511; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc 512; SI-NEXT: s_waitcnt vmcnt(0) 513; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc 514; SI-NEXT: s_waitcnt vmcnt(0) 515; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc 516; SI-NEXT: s_waitcnt vmcnt(0) 517; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc 518; SI-NEXT: s_waitcnt vmcnt(0) 519; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc 520; SI-NEXT: s_waitcnt vmcnt(0) 521; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc 522; SI-NEXT: s_waitcnt vmcnt(0) 523; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc 524; SI-NEXT: s_waitcnt vmcnt(0) 525; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc 526; SI-NEXT: s_waitcnt vmcnt(0) 527; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc 528; SI-NEXT: s_waitcnt vmcnt(0) 529; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 530; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 531; SI-NEXT: v_or_b32_e32 v2, v4, v0 532; SI-NEXT: v_or_b32_e32 v3, v3, v1 533; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 534; SI-NEXT: .LBB3_4: ; %exit 535; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 536; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 537; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 538; SI-NEXT: v_mov_b32_e32 v3, 0xffff 539; SI-NEXT: v_mov_b32_e32 v4, 0x8000 540; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 541; SI-NEXT: v_bfrev_b32_e32 v6, 1 542; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 543; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 544; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 545; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 546; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc 547; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 548; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc 549; SI-NEXT: v_or_b32_e32 v0, v0, v1 550; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v2 551; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 552; SI-NEXT: v_or_b32_e32 v2, v2, v3 553; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 554; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 555; SI-NEXT: s_setpc_b64 s[30:31] 556; 557; GFX9-LABEL: vec_16xi16_extract_4xi16: 558; GFX9: ; %bb.0: 559; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 560; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 561; GFX9-NEXT: ; %bb.1: ; %F 562; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc 563; GFX9-NEXT: s_waitcnt vmcnt(0) 564; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc 565; GFX9-NEXT: s_waitcnt vmcnt(0) 566; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 567; GFX9-NEXT: s_cbranch_execz .LBB3_3 568; GFX9-NEXT: s_branch .LBB3_4 569; GFX9-NEXT: .LBB3_2: 570; GFX9-NEXT: s_mov_b32 s8, 0 571; GFX9-NEXT: s_mov_b32 s9, s8 572; GFX9-NEXT: s_mov_b32 s10, s8 573; GFX9-NEXT: s_mov_b32 s11, s8 574; GFX9-NEXT: s_mov_b32 s12, s8 575; GFX9-NEXT: s_mov_b32 s13, s8 576; GFX9-NEXT: s_mov_b32 s14, s8 577; GFX9-NEXT: s_mov_b32 s15, s8 578; GFX9-NEXT: v_mov_b32_e32 v4, s8 579; GFX9-NEXT: v_mov_b32_e32 v5, s9 580; GFX9-NEXT: v_mov_b32_e32 v6, s10 581; GFX9-NEXT: v_mov_b32_e32 v7, s11 582; GFX9-NEXT: v_mov_b32_e32 v8, s12 583; GFX9-NEXT: v_mov_b32_e32 v9, s13 584; GFX9-NEXT: v_mov_b32_e32 v10, s14 585; GFX9-NEXT: v_mov_b32_e32 v11, s15 586; GFX9-NEXT: .LBB3_3: ; %T 587; GFX9-NEXT: s_waitcnt vmcnt(0) 588; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc 589; GFX9-NEXT: s_waitcnt vmcnt(0) 590; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc 591; GFX9-NEXT: s_waitcnt vmcnt(0) 592; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 593; GFX9-NEXT: .LBB3_4: ; %exit 594; GFX9-NEXT: s_waitcnt vmcnt(0) 595; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] 596; GFX9-NEXT: s_movk_i32 s4, 0x8000 597; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 598; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 599; 
GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] 600; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 601; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 602; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 603; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 604; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 605; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 606; GFX9-NEXT: s_setpc_b64 s[30:31] 607 br i1 undef, label %T, label %F 608 609T: 610 %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0 611 br label %exit 612 613F: 614 %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1 615 br label %exit 616 617exit: 618 %m = phi <16 x i16> [ %t, %T ], [ %f, %F ] 619 %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 620 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1> 621 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> 622 ret <4 x i16> %r2 623} 624 625define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) { 626; 627; SI-LABEL: vec_16xi16_extract_4xi16_2: 628; SI: ; %bb.0: 629; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 630; SI-NEXT: s_cbranch_scc0 .LBB4_2 631; SI-NEXT: ; %bb.1: ; %F 632; SI-NEXT: s_mov_b32 s6, 0 633; SI-NEXT: s_mov_b32 s7, 0xf000 634; SI-NEXT: s_mov_b32 s4, s6 635; SI-NEXT: s_mov_b32 s5, s6 636; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 637; SI-NEXT: s_waitcnt vmcnt(0) 638; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc 639; SI-NEXT: s_waitcnt vmcnt(0) 640; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc 641; SI-NEXT: s_waitcnt vmcnt(0) 642; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc 643; SI-NEXT: s_waitcnt vmcnt(0) 644; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc 645; SI-NEXT: s_waitcnt vmcnt(0) 646; 
SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc 647; SI-NEXT: s_waitcnt vmcnt(0) 648; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc 649; SI-NEXT: s_waitcnt vmcnt(0) 650; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc 651; SI-NEXT: s_waitcnt vmcnt(0) 652; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc 653; SI-NEXT: s_waitcnt vmcnt(0) 654; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc 655; SI-NEXT: s_waitcnt vmcnt(0) 656; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc 657; SI-NEXT: s_waitcnt vmcnt(0) 658; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc 659; SI-NEXT: s_waitcnt vmcnt(0) 660; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc 661; SI-NEXT: s_waitcnt vmcnt(0) 662; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc 663; SI-NEXT: s_waitcnt vmcnt(0) 664; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc 665; SI-NEXT: s_waitcnt vmcnt(0) 666; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc 667; SI-NEXT: s_waitcnt vmcnt(0) 668; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 669; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 670; SI-NEXT: v_or_b32_e32 v2, v6, v2 671; SI-NEXT: v_or_b32_e32 v3, v4, v3 672; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 673; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 674; SI-NEXT: s_mov_b64 vcc, exec 675; SI-NEXT: s_cbranch_execz .LBB4_3 676; SI-NEXT: s_branch .LBB4_4 677; SI-NEXT: .LBB4_2: 678; SI-NEXT: ; implicit-def: $vgpr3 679; SI-NEXT: ; implicit-def: $vgpr5 680; SI-NEXT: ; implicit-def: $vgpr2 681; SI-NEXT: ; implicit-def: $vgpr4 682; SI-NEXT: s_mov_b64 vcc, 0 683; SI-NEXT: .LBB4_3: ; %T 684; SI-NEXT: s_mov_b32 s6, 0 685; SI-NEXT: s_mov_b32 s7, 0xf000 686; SI-NEXT: s_mov_b32 s4, s6 687; SI-NEXT: s_mov_b32 s5, s6 688; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 689; SI-NEXT: s_waitcnt 
vmcnt(0) 690; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc 691; SI-NEXT: s_waitcnt vmcnt(0) 692; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc 693; SI-NEXT: s_waitcnt vmcnt(0) 694; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc 695; SI-NEXT: s_waitcnt vmcnt(0) 696; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc 697; SI-NEXT: s_waitcnt vmcnt(0) 698; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc 699; SI-NEXT: s_waitcnt vmcnt(0) 700; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc 701; SI-NEXT: s_waitcnt vmcnt(0) 702; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc 703; SI-NEXT: s_waitcnt vmcnt(0) 704; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc 705; SI-NEXT: s_waitcnt vmcnt(0) 706; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc 707; SI-NEXT: s_waitcnt vmcnt(0) 708; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc 709; SI-NEXT: s_waitcnt vmcnt(0) 710; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc 711; SI-NEXT: s_waitcnt vmcnt(0) 712; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc 713; SI-NEXT: s_waitcnt vmcnt(0) 714; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc 715; SI-NEXT: s_waitcnt vmcnt(0) 716; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc 717; SI-NEXT: s_waitcnt vmcnt(0) 718; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc 719; SI-NEXT: s_waitcnt vmcnt(0) 720; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 721; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 722; SI-NEXT: v_or_b32_e32 v2, v4, v0 723; SI-NEXT: v_or_b32_e32 v3, v3, v1 724; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 725; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 726; SI-NEXT: .LBB4_4: ; %exit 727; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 728; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 729; 
SI-NEXT: v_bfe_i32 v2, v2, 0, 16 730; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 731; SI-NEXT: v_mov_b32_e32 v4, 0xffff 732; SI-NEXT: v_mov_b32_e32 v5, 0x8000 733; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 734; SI-NEXT: v_bfrev_b32_e32 v7, 1 735; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 736; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 737; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 738; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc 739; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 740; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 741; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 742; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc 743; SI-NEXT: v_or_b32_e32 v0, v0, v1 744; SI-NEXT: v_or_b32_e32 v2, v2, v3 745; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 746; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 747; SI-NEXT: s_setpc_b64 s[30:31] 748; 749; GFX9-LABEL: vec_16xi16_extract_4xi16_2: 750; GFX9: ; %bb.0: 751; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 752; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 753; GFX9-NEXT: ; %bb.1: ; %F 754; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc 755; GFX9-NEXT: s_waitcnt vmcnt(0) 756; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc 757; GFX9-NEXT: s_waitcnt vmcnt(0) 758; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 759; GFX9-NEXT: s_cbranch_execz .LBB4_3 760; GFX9-NEXT: s_branch .LBB4_4 761; GFX9-NEXT: .LBB4_2: 762; GFX9-NEXT: s_mov_b32 s8, 0 763; GFX9-NEXT: s_mov_b32 s9, s8 764; GFX9-NEXT: s_mov_b32 s10, s8 765; GFX9-NEXT: s_mov_b32 s11, s8 766; GFX9-NEXT: s_mov_b32 s12, s8 767; GFX9-NEXT: s_mov_b32 s13, s8 768; GFX9-NEXT: s_mov_b32 s14, s8 769; GFX9-NEXT: s_mov_b32 s15, s8 770; GFX9-NEXT: v_mov_b32_e32 v4, s8 771; GFX9-NEXT: v_mov_b32_e32 v5, s9 772; GFX9-NEXT: v_mov_b32_e32 v6, s10 773; GFX9-NEXT: v_mov_b32_e32 v7, s11 774; GFX9-NEXT: v_mov_b32_e32 v8, s12 775; GFX9-NEXT: v_mov_b32_e32 v9, s13 776; GFX9-NEXT: v_mov_b32_e32 v10, s14 777; GFX9-NEXT: v_mov_b32_e32 v11, s15 778; GFX9-NEXT: .LBB4_3: ; %T 779; GFX9-NEXT: s_waitcnt vmcnt(0) 780; GFX9-NEXT: 
global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc 781; GFX9-NEXT: s_waitcnt vmcnt(0) 782; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc 783; GFX9-NEXT: s_waitcnt vmcnt(0) 784; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 785; GFX9-NEXT: .LBB4_4: ; %exit 786; GFX9-NEXT: s_waitcnt vmcnt(0) 787; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] 788; GFX9-NEXT: s_movk_i32 s4, 0x8000 789; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 790; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 791; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] 792; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 793; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 794; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 795; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 796; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 797; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 798; GFX9-NEXT: s_setpc_b64 s[30:31] 799 br i1 undef, label %T, label %F 800 801T: 802 %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0 803 br label %exit 804 805F: 806 %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1 807 br label %exit 808 809exit: 810 %m = phi <16 x i16> [ %t, %T ], [ %f, %F ] 811 %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 812 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1> 813 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> 814 ret <4 x i16> %r2 815} 816 817define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 x half> addrspace(1) * %p1) { 818; Test body: branches on an undef condition to a volatile <16 x half> load from %p0 (block %T) or from %p1 (block %F), merges the two values with a phi in %exit, extracts lanes 0, 1, 2, 2 as a <4 x half>, and per lane selects 0xH3900 when the lane compares fcmp ugt against 0xH3800 (0.5), otherwise 0xH3D00. The SI and GFX9 check lines that follow are autogenerated (see the NOTE at the top of this file); regenerate them with the update script instead of editing them by hand. 819; SI-LABEL: vec_16xf16_extract_4xf16: 820; SI: ; %bb.0: 821; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 822; SI-NEXT: s_cbranch_scc0 .LBB5_2 823; SI-NEXT: ; %bb.1: ; %F 824; SI-NEXT: s_mov_b32 s6, 0 825; SI-NEXT: s_mov_b32 s7, 0xf000 826; SI-NEXT: s_mov_b32 s4, s6 827; 
SI-NEXT: s_mov_b32 s5, s6 828; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 829; SI-NEXT: s_waitcnt vmcnt(0) 830; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc 831; SI-NEXT: s_waitcnt vmcnt(0) 832; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc 833; SI-NEXT: s_waitcnt vmcnt(0) 834; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc 835; SI-NEXT: s_waitcnt vmcnt(0) 836; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc 837; SI-NEXT: s_waitcnt vmcnt(0) 838; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc 839; SI-NEXT: s_waitcnt vmcnt(0) 840; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc 841; SI-NEXT: s_waitcnt vmcnt(0) 842; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc 843; SI-NEXT: s_waitcnt vmcnt(0) 844; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc 845; SI-NEXT: s_waitcnt vmcnt(0) 846; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc 847; SI-NEXT: s_waitcnt vmcnt(0) 848; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc 849; SI-NEXT: s_waitcnt vmcnt(0) 850; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc 851; SI-NEXT: s_waitcnt vmcnt(0) 852; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc 853; SI-NEXT: s_waitcnt vmcnt(0) 854; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc 855; SI-NEXT: s_waitcnt vmcnt(0) 856; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc 857; SI-NEXT: s_waitcnt vmcnt(0) 858; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc 859; SI-NEXT: s_waitcnt vmcnt(0) 860; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 861; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 862; SI-NEXT: v_or_b32_e32 v2, v6, v2 863; SI-NEXT: v_or_b32_e32 v4, v4, v3 864; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 865; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 
866; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 867; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 868; SI-NEXT: s_mov_b64 vcc, exec 869; SI-NEXT: s_cbranch_execz .LBB5_3 870; SI-NEXT: s_branch .LBB5_4 871; SI-NEXT: .LBB5_2: 872; SI-NEXT: ; implicit-def: $vgpr3 873; SI-NEXT: ; implicit-def: $vgpr4 874; SI-NEXT: ; implicit-def: $vgpr2 875; SI-NEXT: s_mov_b64 vcc, 0 876; SI-NEXT: .LBB5_3: ; %T 877; SI-NEXT: s_mov_b32 s6, 0 878; SI-NEXT: s_mov_b32 s7, 0xf000 879; SI-NEXT: s_mov_b32 s4, s6 880; SI-NEXT: s_mov_b32 s5, s6 881; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 882; SI-NEXT: s_waitcnt vmcnt(0) 883; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc 884; SI-NEXT: s_waitcnt vmcnt(0) 885; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc 886; SI-NEXT: s_waitcnt vmcnt(0) 887; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc 888; SI-NEXT: s_waitcnt vmcnt(0) 889; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc 890; SI-NEXT: s_waitcnt vmcnt(0) 891; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc 892; SI-NEXT: s_waitcnt vmcnt(0) 893; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc 894; SI-NEXT: s_waitcnt vmcnt(0) 895; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc 896; SI-NEXT: s_waitcnt vmcnt(0) 897; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc 898; SI-NEXT: s_waitcnt vmcnt(0) 899; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc 900; SI-NEXT: s_waitcnt vmcnt(0) 901; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc 902; SI-NEXT: s_waitcnt vmcnt(0) 903; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc 904; SI-NEXT: s_waitcnt vmcnt(0) 905; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc 906; SI-NEXT: s_waitcnt vmcnt(0) 907; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc 908; SI-NEXT: 
s_waitcnt vmcnt(0) 909; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc 910; SI-NEXT: s_waitcnt vmcnt(0) 911; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc 912; SI-NEXT: s_waitcnt vmcnt(0) 913; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 914; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 915; SI-NEXT: v_or_b32_e32 v0, v4, v0 916; SI-NEXT: v_or_b32_e32 v1, v2, v1 917; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 918; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 919; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 920; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 921; SI-NEXT: .LBB5_4: ; %exit 922; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 923; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 924; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 925; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 926; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 927; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 928; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 929; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 930; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 931; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 932; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 933; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 934; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 935; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 936; SI-NEXT: v_mov_b32_e32 v3, v2 937; SI-NEXT: s_setpc_b64 s[30:31] 938; 939; GFX9-LABEL: vec_16xf16_extract_4xf16: 940; GFX9: ; %bb.0: 941; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 942; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 943; GFX9-NEXT: ; %bb.1: ; %F 944; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc 945; GFX9-NEXT: s_waitcnt vmcnt(0) 946; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc 947; GFX9-NEXT: s_waitcnt vmcnt(0) 948; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 949; GFX9-NEXT: s_cbranch_execz .LBB5_3 950; GFX9-NEXT: s_branch .LBB5_4 951; GFX9-NEXT: .LBB5_2: 952; GFX9-NEXT: s_mov_b32 s8, 0 953; GFX9-NEXT: s_mov_b32 s9, s8 954; GFX9-NEXT: s_mov_b32 s10, s8 955; GFX9-NEXT: s_mov_b32 s11, s8 956; GFX9-NEXT: s_mov_b32 s12, s8 957; GFX9-NEXT: s_mov_b32 s13, 
s8 958; GFX9-NEXT: s_mov_b32 s14, s8 959; GFX9-NEXT: s_mov_b32 s15, s8 960; GFX9-NEXT: v_mov_b32_e32 v4, s8 961; GFX9-NEXT: v_mov_b32_e32 v5, s9 962; GFX9-NEXT: v_mov_b32_e32 v6, s10 963; GFX9-NEXT: v_mov_b32_e32 v7, s11 964; GFX9-NEXT: v_mov_b32_e32 v8, s12 965; GFX9-NEXT: v_mov_b32_e32 v9, s13 966; GFX9-NEXT: v_mov_b32_e32 v10, s14 967; GFX9-NEXT: v_mov_b32_e32 v11, s15 968; GFX9-NEXT: .LBB5_3: ; %T 969; GFX9-NEXT: s_waitcnt vmcnt(0) 970; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc 971; GFX9-NEXT: s_waitcnt vmcnt(0) 972; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc 973; GFX9-NEXT: s_waitcnt vmcnt(0) 974; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 975; GFX9-NEXT: .LBB5_4: ; %exit 976; GFX9-NEXT: s_waitcnt vmcnt(0) 977; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 978; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 979; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 980; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 981; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 982; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 983; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc 984; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD 985; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc 986; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 987; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 988; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD 989; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 990; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 991; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 992; GFX9-NEXT: s_setpc_b64 s[30:31] 993 br i1 undef, label %T, label %F 994 995T: 996 %t = load volatile <16 x half>, <16 x half> addrspace(1) * %p0 997 br label %exit 998 999F: 1000 %f = load volatile <16 x half>, <16 x half> addrspace(1) * %p1 1001 br label %exit 1002 1003exit: 1004 %m = phi <16 x half> [ %t, %T ], [ %f, %F ] 1005 %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 1006 %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, 
half 0xH3800, half 0xH3800> 1007 %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00> 1008 ret <4 x half> %r2 1009} 1010