; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s

; Each function below loads a wide 16-bit-element vector (volatile) on one of
; two paths, merges the two values with a phi, extracts a 4-element subvector
; with shufflevector, and then runs a compare + select on the extracted lanes.
; The expected assembly is checked for the default subtarget and for gfx900.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.

; <8 x i16> source, extracted lanes <0, 1, 2, 2>.
; Per lane: result is -32768 when the lane is signed-greater-than -1, else -1.
define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: vec_8xi16_extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB0_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB0_3
; SI-NEXT:    s_branch .LBB0_4
; SI-NEXT:  .LBB0_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB0_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:  .LBB0_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v6, 1
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB0_3
; GFX9-NEXT:    s_branch .LBB0_4
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT:  .LBB0_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB0_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; <8 x i16> source, extracted lanes <4, 5, 6, 7> (the upper half).
; Per lane: result is -32768 when the lane is signed-greater-than -1, else -1.
define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: vec_8xi16_extract_4xi16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB1_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB1_3
; SI-NEXT:    s_branch .LBB1_4
; SI-NEXT:  .LBB1_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr5
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB1_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:  .LBB1_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v7, 1
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB1_3
; GFX9-NEXT:    s_branch .LBB1_4
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT:  .LBB1_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB1_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; <8 x half> source, extracted lanes <0, 1, 2, 2>.
; Per lane: result is 0xH3900 when the lane compares "fcmp ugt" against
; 0xH3800, else 0xH3D00.
define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
; SI-LABEL: vec_8xf16_extract_4xf16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB2_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v4, v4, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB2_3
; SI-NEXT:    s_branch .LBB2_4
; SI-NEXT:  .LBB2_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB2_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:  .LBB2_4: ; %exit
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xf16_extract_4xf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB2_3
; GFX9-NEXT:    s_branch .LBB2_4
; GFX9-NEXT:  .LBB2_2:
; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT:  .LBB2_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB2_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v4, v3, vcc
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
  ret <4 x half> %r2
}

; <16 x i16> source, extracted lanes <0, 1, 2, 2>.
; Per lane: result is -32768 when the lane is signed-greater-than -1, else -1.
define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
;
; SI-LABEL: vec_16xi16_extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB3_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB3_3
; SI-NEXT:    s_branch .LBB3_4
; SI-NEXT:  .LBB3_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB3_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:  .LBB3_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v6, 1
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB3_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT:    s_cbranch_execz .LBB3_3
; GFX9-NEXT:    s_branch .LBB3_4
; GFX9-NEXT:  .LBB3_2:
; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT:  .LBB3_3: ; %T
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT:  .LBB3_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; <16 x i16> source, extracted lanes <4, 5, 6, 7>.
; Per lane: result is -32768 when the lane is signed-greater-than -1, else -1.
define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
;
; SI-LABEL: vec_16xi16_extract_4xi16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB4_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB4_3
; SI-NEXT:    s_branch .LBB4_4
; SI-NEXT:  .LBB4_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr5
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB4_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:  .LBB4_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v7, 1
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB4_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT:    s_cbranch_execz .LBB4_3
; GFX9-NEXT:    s_branch .LBB4_4
; GFX9-NEXT:  .LBB4_2:
; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT:  .LBB4_3: ; %T
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT:  .LBB4_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
  br label %exit

F:
  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> 
addrspace(1) * %p0, <16 x half> addrspace(1) * %p1) { 767; 768; SI-LABEL: vec_16xf16_extract_4xf16: 769; SI: ; %bb.0: 770; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 771; SI-NEXT: s_cbranch_scc0 .LBB5_2 772; SI-NEXT: ; %bb.1: ; %F 773; SI-NEXT: s_mov_b32 s6, 0 774; SI-NEXT: s_mov_b32 s7, 0xf000 775; SI-NEXT: s_mov_b32 s4, s6 776; SI-NEXT: s_mov_b32 s5, s6 777; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 778; SI-NEXT: s_waitcnt vmcnt(0) 779; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc 780; SI-NEXT: s_waitcnt vmcnt(0) 781; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc 782; SI-NEXT: s_waitcnt vmcnt(0) 783; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc 784; SI-NEXT: s_waitcnt vmcnt(0) 785; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc 786; SI-NEXT: s_waitcnt vmcnt(0) 787; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc 788; SI-NEXT: s_waitcnt vmcnt(0) 789; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc 790; SI-NEXT: s_waitcnt vmcnt(0) 791; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc 792; SI-NEXT: s_waitcnt vmcnt(0) 793; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc 794; SI-NEXT: s_waitcnt vmcnt(0) 795; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc 796; SI-NEXT: s_waitcnt vmcnt(0) 797; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc 798; SI-NEXT: s_waitcnt vmcnt(0) 799; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc 800; SI-NEXT: s_waitcnt vmcnt(0) 801; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc 802; SI-NEXT: s_waitcnt vmcnt(0) 803; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc 804; SI-NEXT: s_waitcnt vmcnt(0) 805; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc 806; SI-NEXT: s_waitcnt vmcnt(0) 807; SI-NEXT: 
buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc 808; SI-NEXT: s_waitcnt vmcnt(0) 809; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 810; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 811; SI-NEXT: v_or_b32_e32 v2, v6, v2 812; SI-NEXT: v_or_b32_e32 v4, v4, v3 813; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 814; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 815; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 816; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 817; SI-NEXT: s_mov_b64 vcc, exec 818; SI-NEXT: s_cbranch_execz .LBB5_3 819; SI-NEXT: s_branch .LBB5_4 820; SI-NEXT: .LBB5_2: 821; SI-NEXT: ; implicit-def: $vgpr3 822; SI-NEXT: ; implicit-def: $vgpr4 823; SI-NEXT: ; implicit-def: $vgpr2 824; SI-NEXT: s_mov_b64 vcc, 0 825; SI-NEXT: .LBB5_3: ; %T 826; SI-NEXT: s_mov_b32 s6, 0 827; SI-NEXT: s_mov_b32 s7, 0xf000 828; SI-NEXT: s_mov_b32 s4, s6 829; SI-NEXT: s_mov_b32 s5, s6 830; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 831; SI-NEXT: s_waitcnt vmcnt(0) 832; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc 833; SI-NEXT: s_waitcnt vmcnt(0) 834; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc 835; SI-NEXT: s_waitcnt vmcnt(0) 836; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc 837; SI-NEXT: s_waitcnt vmcnt(0) 838; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc 839; SI-NEXT: s_waitcnt vmcnt(0) 840; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc 841; SI-NEXT: s_waitcnt vmcnt(0) 842; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc 843; SI-NEXT: s_waitcnt vmcnt(0) 844; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc 845; SI-NEXT: s_waitcnt vmcnt(0) 846; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc 847; SI-NEXT: s_waitcnt vmcnt(0) 848; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc 849; SI-NEXT: s_waitcnt vmcnt(0) 850; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc 851; 
SI-NEXT: s_waitcnt vmcnt(0) 852; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc 853; SI-NEXT: s_waitcnt vmcnt(0) 854; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc 855; SI-NEXT: s_waitcnt vmcnt(0) 856; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc 857; SI-NEXT: s_waitcnt vmcnt(0) 858; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc 859; SI-NEXT: s_waitcnt vmcnt(0) 860; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc 861; SI-NEXT: s_waitcnt vmcnt(0) 862; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 863; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 864; SI-NEXT: v_or_b32_e32 v0, v4, v0 865; SI-NEXT: v_or_b32_e32 v1, v2, v1 866; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 867; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 868; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 869; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 870; SI-NEXT: .LBB5_4: ; %exit 871; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 872; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 873; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 874; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 875; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 876; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 877; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 878; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 879; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 880; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 881; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 882; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 883; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 884; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 885; SI-NEXT: v_mov_b32_e32 v3, v2 886; SI-NEXT: s_setpc_b64 s[30:31] 887; 888; GFX9-LABEL: vec_16xf16_extract_4xf16: 889; GFX9: ; %bb.0: 890; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 891; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 892; GFX9-NEXT: ; %bb.1: ; %F 893; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc 894; GFX9-NEXT: s_waitcnt vmcnt(0) 895; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc 896; GFX9-NEXT: s_waitcnt vmcnt(0) 897; 
GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 898; GFX9-NEXT: s_cbranch_execz .LBB5_3 899; GFX9-NEXT: s_branch .LBB5_4 900; GFX9-NEXT: .LBB5_2: 901; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 902; GFX9-NEXT: .LBB5_3: ; %T 903; GFX9-NEXT: s_waitcnt vmcnt(0) 904; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc 905; GFX9-NEXT: s_waitcnt vmcnt(0) 906; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc 907; GFX9-NEXT: s_waitcnt vmcnt(0) 908; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 909; GFX9-NEXT: .LBB5_4: ; %exit 910; GFX9-NEXT: s_waitcnt vmcnt(0) 911; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 912; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 913; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 914; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 915; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 916; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 917; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc 918; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD 919; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc 920; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 921; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 922; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD 923; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 924; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 925; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 926; GFX9-NEXT: s_setpc_b64 s[30:31] 927 br i1 undef, label %T, label %F 928 929T: 930 %t = load volatile <16 x half>, <16 x half> addrspace(1) * %p0 931 br label %exit 932 933F: 934 %f = load volatile <16 x half>, <16 x half> addrspace(1) * %p1 935 br label %exit 936 937exit: 938 %m = phi <16 x half> [ %t, %T ], [ %f, %F ] 939 %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 940 %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800> 941 %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, 
half 0xH3D00, half 0xH3D00, half 0xH3D00> 942 ret <4 x half> %r2 943} 944