; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s

; Every function in this file follows the same shape:
;   * entry branches on an undef condition to block %T or block %F,
;   * %T does a volatile load of an 8-element vector from %p0, %F does the
;     same from %p1,
;   * %exit merges both loads with a phi, extracts a 4-element subvector with
;     shufflevector, and feeds it through a vector compare + select.
; The expected assembly is checked twice, once per RUN line above (default
; amdgcn target and -mcpu=gfx900). Those check lines are autogenerated, so
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.

; <8 x i16> source, shuffle mask <0, 1, 2, 2>: the result uses only the first
; three source elements, with element 2 repeated in the last lane.
define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB0_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB0_3
; SI-NEXT:    s_branch .LBB0_4
; SI-NEXT:  .LBB0_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB0_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:  .LBB0_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    v_mov_b32_e32 v3, 0x8000
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v5, 1
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff8000
; SI-NEXT:    v_mov_b32_e32 v7, s4
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB0_3
; GFX9-NEXT:    s_branch .LBB0_4
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB0_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB0_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v3, s4, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, v4, v3
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so the test does not pin which side is taken.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the full 8-element vector from the first pointer.
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  ; Volatile load of the full 8-element vector from the second pointer.
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Take source elements 0, 1, 2 and 2 again.
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Signed "greater than -1" per lane; true lanes become -32768, false lanes -1.
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; Same as @extract_4xi16, but the shuffle mask is <4, 5, 6, 7>, i.e. the last
; four elements of the <8 x i16> source.
define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: extract_4xi16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB1_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB1_3
; SI-NEXT:    s_branch .LBB1_4
; SI-NEXT:  .LBB1_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr5
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB1_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:  .LBB1_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v7, 1
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_4xi16_2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB1_3
; GFX9-NEXT:    s_branch .LBB1_4
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB1_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB1_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, s4, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
; GFX9-NEXT:    v_and_b32_e32 v2, v4, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so the test does not pin which side is taken.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the full 8-element vector from the first pointer.
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  ; Volatile load of the full 8-element vector from the second pointer.
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Take source elements 4 through 7.
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Signed "greater than -1" per lane; true lanes become -32768, false lanes -1.
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; Half-precision variant: <8 x half> source, shuffle mask <0, 1, 2, 2>, and an
; fcmp/select pair instead of icmp/select.
define <4 x half> @extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
; SI-LABEL: extract_4xf16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB2_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v4, v4, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB2_3
; SI-NEXT:    s_branch .LBB2_4
; SI-NEXT:  .LBB2_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB2_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:  .LBB2_4: ; %exit
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_4xf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB2_3
; GFX9-NEXT:    s_branch .LBB2_4
; GFX9-NEXT:  .LBB2_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB2_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB2_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v4, v3, vcc
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so the test does not pin which side is taken.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the full 8-element vector from the first pointer.
  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
  br label %exit

F:
  ; Volatile load of the full 8-element vector from the second pointer.
  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  ; Take source elements 0, 1, 2 and 2 again.
  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Unordered-or-greater-than compare against 0xH3800 (0.5) per lane; true
  ; lanes become 0xH3900, false lanes 0xH3D00.
  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
  ret <4 x half> %r2
}