; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s

; Codegen tests for <4 x half> shufflevector on gfx900, gfx1010 and gfx1100.
;
; Every function loads one <4 x half> from each of its two addrspace(1)
; pointer arguments and returns a single shufflevector of the two values.
; Mask indices 0-3 select from %val0 and 4-7 select from %val1. The suffix of
; each function name spells out its mask, with 'u' standing for an undef lane.
; The assertion blocks are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Lanes 0-1 = %val0[2..3]; lanes 2-3 undef.
define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_23uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_23uu:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_23uu:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val0[2..3]; lane 2 = %val1[0]; lane 3 undef.
define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_234u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v6
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_234u:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_234u:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
  ret <4 x half> %shuffle
}

; Lane 1 = %val0[1]; lane 3 = %val0[3]; lanes 0 and 2 undef.
define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u1u3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_u1u3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_u1u3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
  ret <4 x half> %shuffle
}

; Lane 1 = %val0[3]; lane 3 = %val0[1]; lanes 0 and 2 undef.
define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3u1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_u3u1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_u3u1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
  ret <4 x half> %shuffle
}

; Lane 1 = %val0[3]; all other lanes undef.
define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_u3uu:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_u3uu:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  ret <4 x half> %shuffle
}

; Lane 0 = %val0[3]; lane 2 = %val1[2]; lanes 1 and 3 undef.
define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3u6u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_3u6u:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_3u6u:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
  ret <4 x half> %shuffle
}

; Lane 0 = %val0[3]; lane 3 = %val1[3]; lanes 1 and 2 undef.
define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3uu7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_3uu7:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_3uu7:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
  ret <4 x half> %shuffle
}

; Lane 0 = %val0[3]; lanes 1 and 3 = %val1[1]; lane 2 undef.
define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_35u5:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_35u5:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_35u5:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
  ret <4 x half> %shuffle
}

; Lane 0 = %val0[3]; lane 1 = %val1[1]; lane 2 = %val1[3]; lane 3 undef.
define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_357u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_357u:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_357u:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
  ret <4 x half> %shuffle
}

; Lanes 0-1 and lanes 2-3 both = %val0[0..1].
define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0101:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_0101:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Identity mask on %val0.
define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0123:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_0123:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val0[0..1]; lanes 2-3 = %val1[0..1].
define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0145:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0145:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_0145:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val0[0..1]; lanes 2-3 = %val1[2..3].
define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0167:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_0167:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val0[2..3]; lanes 2-3 = %val0[0..1].
define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2301:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2301:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_2301:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Lanes 0-1 and lanes 2-3 both = %val0[2..3].
define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2323:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2323:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_2323:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val0[2..3]; lanes 2-3 = %val1[0..1].
define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2345:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2345:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_2345:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val0[2..3]; lanes 2-3 = %val1[2..3].
define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2367:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_2367:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val1[0..1]; lanes 2-3 = %val0[0..1].
define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4501:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v5, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4501:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_4501:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val1[0..1]; lanes 2-3 = %val0[2..3].
define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4523:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4523:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_4523:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Lanes 0-1 and lanes 2-3 both = %val1[0..1].
define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4545:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4545:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_4545:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Identity mask on %val1.
define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4567:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4567:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_4567:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val1[2..3]; lanes 2-3 = %val0[0..1].
define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6701:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6701:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Lanes 0-1 = %val1[2..3]; lanes 2-3 = %val0[2..3].
define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6723:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6723:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6745:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6745:
;
GFX10: ; %bb.0: 853; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 854; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 855; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 856; GFX10-NEXT: s_waitcnt vmcnt(0) 857; GFX10-NEXT: v_mov_b32_e32 v0, v2 858; GFX10-NEXT: s_setpc_b64 s[30:31] 859; 860; GFX11-LABEL: shuffle_v4f16_6745: 861; GFX11: ; %bb.0: 862; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 863; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 864; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off 865; GFX11-NEXT: s_waitcnt vmcnt(0) 866; GFX11-NEXT: v_mov_b32_e32 v0, v2 867; GFX11-NEXT: s_setpc_b64 s[30:31] 868 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 869 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 870 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5> 871 ret <4 x half> %shuffle 872} 873 874define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 875; GFX9-LABEL: shuffle_v4f16_6767: 876; GFX9: ; %bb.0: 877; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 878; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 879; GFX9-NEXT: s_waitcnt vmcnt(0) 880; GFX9-NEXT: v_mov_b32_e32 v1, v0 881; GFX9-NEXT: s_setpc_b64 s[30:31] 882; 883; GFX10-LABEL: shuffle_v4f16_6767: 884; GFX10: ; %bb.0: 885; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 886; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 887; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 888; GFX10-NEXT: s_waitcnt vmcnt(0) 889; GFX10-NEXT: v_mov_b32_e32 v1, v0 890; GFX10-NEXT: s_setpc_b64 s[30:31] 891; 892; GFX11-LABEL: shuffle_v4f16_6767: 893; GFX11: ; %bb.0: 894; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 895; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 896; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4 897; GFX11-NEXT: s_waitcnt vmcnt(0) 898; GFX11-NEXT: v_mov_b32_e32 v1, v0 899; GFX11-NEXT: s_setpc_b64 s[30:31] 900 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 901 
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 902 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7> 903 ret <4 x half> %shuffle 904} 905 906define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 907; GFX9-LABEL: shuffle_v4f16_2356: 908; GFX9: ; %bb.0: 909; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 910; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 911; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 912; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 913; GFX9-NEXT: s_waitcnt vmcnt(1) 914; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 915; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 916; GFX9-NEXT: s_waitcnt vmcnt(0) 917; GFX9-NEXT: v_mov_b32_e32 v0, v4 918; GFX9-NEXT: s_setpc_b64 s[30:31] 919; 920; GFX10-LABEL: shuffle_v4f16_2356: 921; GFX10: ; %bb.0: 922; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 923; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 924; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 925; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 926; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 927; GFX10-NEXT: s_waitcnt vmcnt(1) 928; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 929; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 930; GFX10-NEXT: s_waitcnt vmcnt(0) 931; GFX10-NEXT: v_mov_b32_e32 v0, v4 932; GFX10-NEXT: s_setpc_b64 s[30:31] 933; 934; GFX11-LABEL: shuffle_v4f16_2356: 935; GFX11: ; %bb.0: 936; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 937; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 938; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 939; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 940; GFX11-NEXT: s_waitcnt vmcnt(1) 941; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 942; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 943; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 944; 
GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 945; GFX11-NEXT: s_waitcnt vmcnt(0) 946; GFX11-NEXT: s_setpc_b64 s[30:31] 947 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 948 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 949 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 950 ret <4 x half> %shuffle 951} 952 953define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 954; GFX9-LABEL: shuffle_v4f16_5623: 955; GFX9: ; %bb.0: 956; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 957; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 958; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 959; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 960; GFX9-NEXT: s_waitcnt vmcnt(1) 961; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 962; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 963; GFX9-NEXT: s_waitcnt vmcnt(0) 964; GFX9-NEXT: v_mov_b32_e32 v1, v4 965; GFX9-NEXT: s_setpc_b64 s[30:31] 966; 967; GFX10-LABEL: shuffle_v4f16_5623: 968; GFX10: ; %bb.0: 969; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 970; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 971; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 972; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 973; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 974; GFX10-NEXT: s_waitcnt vmcnt(1) 975; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 976; GFX10-NEXT: s_waitcnt vmcnt(0) 977; GFX10-NEXT: v_mov_b32_e32 v1, v4 978; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 979; GFX10-NEXT: s_setpc_b64 s[30:31] 980; 981; GFX11-LABEL: shuffle_v4f16_5623: 982; GFX11: ; %bb.0: 983; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 984; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 985; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 986; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 987; GFX11-NEXT: s_waitcnt vmcnt(1) 988; 
GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 989; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 990; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 991; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 992; GFX11-NEXT: s_waitcnt vmcnt(0) 993; GFX11-NEXT: s_setpc_b64 s[30:31] 994 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 995 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 996 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3> 997 ret <4 x half> %shuffle 998} 999 1000define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1001; GFX9-LABEL: shuffle_v4f16_3456: 1002; GFX9: ; %bb.0: 1003; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1004; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 1005; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 1006; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1007; GFX9-NEXT: s_waitcnt vmcnt(1) 1008; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1009; GFX9-NEXT: s_waitcnt vmcnt(0) 1010; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1011; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 1012; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 1013; GFX9-NEXT: s_setpc_b64 s[30:31] 1014; 1015; GFX10-LABEL: shuffle_v4f16_3456: 1016; GFX10: ; %bb.0: 1017; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1018; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1019; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 1020; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 1021; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1022; GFX10-NEXT: s_waitcnt vmcnt(1) 1023; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1024; GFX10-NEXT: s_waitcnt vmcnt(0) 1025; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1026; 
GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 1027; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 1028; GFX10-NEXT: s_setpc_b64 s[30:31] 1029; 1030; GFX11-LABEL: shuffle_v4f16_3456: 1031; GFX11: ; %bb.0: 1032; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1033; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1034; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 1035; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off 1036; GFX11-NEXT: s_waitcnt vmcnt(1) 1037; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 1038; GFX11-NEXT: s_waitcnt vmcnt(0) 1039; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1040; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1041; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 1042; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 1043; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1044; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 1045; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 1046; GFX11-NEXT: s_setpc_b64 s[30:31] 1047 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1048 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1049 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 1050 ret <4 x half> %shuffle 1051} 1052 1053define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1054; GFX9-LABEL: shuffle_v4f16_5634: 1055; GFX9: ; %bb.0: 1056; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1057; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 1058; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 1059; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1060; GFX9-NEXT: s_waitcnt vmcnt(1) 1061; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1062; GFX9-NEXT: s_waitcnt vmcnt(0) 1063; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1064; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 1065; GFX9-NEXT: 
v_lshl_or_b32 v0, v5, 16, v0 1066; GFX9-NEXT: s_setpc_b64 s[30:31] 1067; 1068; GFX10-LABEL: shuffle_v4f16_5634: 1069; GFX10: ; %bb.0: 1070; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1071; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1072; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 1073; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 1074; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1075; GFX10-NEXT: s_waitcnt vmcnt(1) 1076; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1077; GFX10-NEXT: s_waitcnt vmcnt(0) 1078; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1079; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1080; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 1081; GFX10-NEXT: s_setpc_b64 s[30:31] 1082; 1083; GFX11-LABEL: shuffle_v4f16_5634: 1084; GFX11: ; %bb.0: 1085; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1086; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1087; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 1088; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 1089; GFX11-NEXT: s_waitcnt vmcnt(1) 1090; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1091; GFX11-NEXT: s_waitcnt vmcnt(0) 1092; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1093; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1094; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1095; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 1096; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1097; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 1098; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 1099; GFX11-NEXT: s_setpc_b64 s[30:31] 1100 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1101 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1102 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4> 1103 ret <4 x half> %shuffle 1104} 1105 1106define <4 x half> @shuffle_v4f16_5734(<4 x 
half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1107; GFX9-LABEL: shuffle_v4f16_5734: 1108; GFX9: ; %bb.0: 1109; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1110; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 1111; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 1112; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1113; GFX9-NEXT: s_waitcnt vmcnt(1) 1114; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1115; GFX9-NEXT: s_waitcnt vmcnt(0) 1116; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1117; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 1118; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 1119; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 1120; GFX9-NEXT: s_setpc_b64 s[30:31] 1121; 1122; GFX10-LABEL: shuffle_v4f16_5734: 1123; GFX10: ; %bb.0: 1124; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1125; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1126; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 1127; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 1128; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1129; GFX10-NEXT: s_waitcnt vmcnt(1) 1130; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1131; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v5 1132; GFX10-NEXT: s_waitcnt vmcnt(0) 1133; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1134; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 1135; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 1136; GFX10-NEXT: s_setpc_b64 s[30:31] 1137; 1138; GFX11-LABEL: shuffle_v4f16_5734: 1139; GFX11: ; %bb.0: 1140; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1141; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1142; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 1143; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 1144; GFX11-NEXT: s_waitcnt vmcnt(1) 1145; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1146; GFX11-NEXT: 
s_waitcnt vmcnt(0) 1147; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1148; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1149; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1150; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1151; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 1152; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1153; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 1154; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 1155; GFX11-NEXT: s_setpc_b64 s[30:31] 1156 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1157 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1158 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4> 1159 ret <4 x half> %shuffle 1160} 1161 1162define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 1163; GFX9-LABEL: shuffle_v4i16_2356: 1164; GFX9: ; %bb.0: 1165; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1166; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1167; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 1168; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1169; GFX9-NEXT: s_waitcnt vmcnt(1) 1170; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1171; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 1172; GFX9-NEXT: s_waitcnt vmcnt(0) 1173; GFX9-NEXT: v_mov_b32_e32 v0, v4 1174; GFX9-NEXT: s_setpc_b64 s[30:31] 1175; 1176; GFX10-LABEL: shuffle_v4i16_2356: 1177; GFX10: ; %bb.0: 1178; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1179; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1180; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1181; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 1182; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1183; GFX10-NEXT: s_waitcnt vmcnt(1) 1184; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1185; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 
1186; GFX10-NEXT: s_waitcnt vmcnt(0) 1187; GFX10-NEXT: v_mov_b32_e32 v0, v4 1188; GFX10-NEXT: s_setpc_b64 s[30:31] 1189; 1190; GFX11-LABEL: shuffle_v4i16_2356: 1191; GFX11: ; %bb.0: 1192; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1193; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1194; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 1195; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 1196; GFX11-NEXT: s_waitcnt vmcnt(1) 1197; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1198; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1199; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 1200; GFX11-NEXT: s_waitcnt vmcnt(0) 1201; GFX11-NEXT: s_setpc_b64 s[30:31] 1202 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 1203 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 1204 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 1205 ret <4 x i16> %shuffle 1206} 1207 1208define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 1209; GFX9-LABEL: shuffle_v4i16_0167: 1210; GFX9: ; %bb.0: 1211; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1212; GFX9-NEXT: global_load_dword v4, v[0:1], off 1213; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 1214; GFX9-NEXT: s_waitcnt vmcnt(1) 1215; GFX9-NEXT: v_mov_b32_e32 v0, v4 1216; GFX9-NEXT: s_waitcnt vmcnt(0) 1217; GFX9-NEXT: v_mov_b32_e32 v1, v5 1218; GFX9-NEXT: s_setpc_b64 s[30:31] 1219; 1220; GFX10-LABEL: shuffle_v4i16_0167: 1221; GFX10: ; %bb.0: 1222; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1223; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1224; GFX10-NEXT: global_load_dword v4, v[0:1], off 1225; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 1226; GFX10-NEXT: s_waitcnt vmcnt(1) 1227; GFX10-NEXT: v_mov_b32_e32 v0, v4 1228; GFX10-NEXT: s_waitcnt vmcnt(0) 1229; GFX10-NEXT: v_mov_b32_e32 v1, v5 1230; GFX10-NEXT: s_setpc_b64 s[30:31] 1231; 1232; GFX11-LABEL: shuffle_v4i16_0167: 1233; GFX11: ; %bb.0: 1234; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 1235; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1236; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1237; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 1238; GFX11-NEXT: s_waitcnt vmcnt(0) 1239; GFX11-NEXT: s_setpc_b64 s[30:31] 1240 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 1241 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 1242 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 1243 ret <4 x i16> %shuffle 1244} 1245 1246define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1247; GFX9-LABEL: shuffle_v4f16_0000: 1248; GFX9: ; %bb.0: 1249; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1250; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1251; GFX9-NEXT: s_waitcnt vmcnt(0) 1252; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 1253; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1254; GFX9-NEXT: v_mov_b32_e32 v1, v0 1255; GFX9-NEXT: s_setpc_b64 s[30:31] 1256; 1257; GFX10-LABEL: shuffle_v4f16_0000: 1258; GFX10: ; %bb.0: 1259; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1260; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1261; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1262; GFX10-NEXT: s_waitcnt vmcnt(0) 1263; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0 1264; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1265; GFX10-NEXT: v_mov_b32_e32 v1, v0 1266; GFX10-NEXT: s_setpc_b64 s[30:31] 1267; 1268; GFX11-LABEL: shuffle_v4f16_0000: 1269; GFX11: ; %bb.0: 1270; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1271; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1272; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1273; GFX11-NEXT: s_waitcnt vmcnt(0) 1274; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 1275; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1276; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1277; GFX11-NEXT: v_mov_b32_e32 v1, v0 1278; GFX11-NEXT: s_setpc_b64 s[30:31] 1279 %val0 = load <4 x half>, <4 x 
half> addrspace(1)* %arg0 1280 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1281 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer 1282 ret <4 x half> %shuffle 1283} 1284 1285define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1286; GFX9-LABEL: shuffle_v4f16_1010: 1287; GFX9: ; %bb.0: 1288; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1289; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1290; GFX9-NEXT: s_waitcnt vmcnt(0) 1291; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 1292; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1293; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1294; GFX9-NEXT: v_mov_b32_e32 v1, v0 1295; GFX9-NEXT: s_setpc_b64 s[30:31] 1296; 1297; GFX10-LABEL: shuffle_v4f16_1010: 1298; GFX10: ; %bb.0: 1299; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1300; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1301; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1302; GFX10-NEXT: s_waitcnt vmcnt(0) 1303; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 1304; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1305; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1306; GFX10-NEXT: v_mov_b32_e32 v1, v0 1307; GFX10-NEXT: s_setpc_b64 s[30:31] 1308; 1309; GFX11-LABEL: shuffle_v4f16_1010: 1310; GFX11: ; %bb.0: 1311; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1312; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1313; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1314; GFX11-NEXT: s_waitcnt vmcnt(0) 1315; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1316; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1317; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1318; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1319; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1320; GFX11-NEXT: v_mov_b32_e32 v1, v0 1321; GFX11-NEXT: s_setpc_b64 s[30:31] 1322 %val0 = load <4 x 
half>, <4 x half> addrspace(1)* %arg0 1323 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1324 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0> 1325 ret <4 x half> %shuffle 1326} 1327 1328define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1329; GFX9-LABEL: shuffle_v4f16_1100: 1330; GFX9: ; %bb.0: 1331; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1332; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1333; GFX9-NEXT: s_waitcnt vmcnt(0) 1334; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 1335; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1336; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1337; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 1338; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 1339; GFX9-NEXT: s_setpc_b64 s[30:31] 1340; 1341; GFX10-LABEL: shuffle_v4f16_1100: 1342; GFX10: ; %bb.0: 1343; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1344; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1345; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 1346; GFX10-NEXT: s_waitcnt vmcnt(0) 1347; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1348; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v1 1349; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 1350; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 1351; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 1352; GFX10-NEXT: s_setpc_b64 s[30:31] 1353; 1354; GFX11-LABEL: shuffle_v4f16_1100: 1355; GFX11: ; %bb.0: 1356; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1357; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1358; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off 1359; GFX11-NEXT: s_waitcnt vmcnt(0) 1360; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1361; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 1362; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1363; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 1364; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 1365; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1366; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, 
v2 1367; GFX11-NEXT: s_setpc_b64 s[30:31] 1368 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1369 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1370 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0> 1371 ret <4 x half> %shuffle 1372} 1373 1374define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1375; GFX9-LABEL: shuffle_v4f16_6161: 1376; GFX9: ; %bb.0: 1377; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1378; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 1379; GFX9-NEXT: global_load_dword v5, v[0:1], off 1380; GFX9-NEXT: s_waitcnt vmcnt(1) 1381; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 1382; GFX9-NEXT: s_waitcnt vmcnt(0) 1383; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 1384; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1385; GFX9-NEXT: v_mov_b32_e32 v1, v0 1386; GFX9-NEXT: s_setpc_b64 s[30:31] 1387; 1388; GFX10-LABEL: shuffle_v4f16_6161: 1389; GFX10: ; %bb.0: 1390; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1391; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1392; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 1393; GFX10-NEXT: global_load_dword v5, v[0:1], off 1394; GFX10-NEXT: s_waitcnt vmcnt(1) 1395; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 1396; GFX10-NEXT: s_waitcnt vmcnt(0) 1397; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 1398; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1399; GFX10-NEXT: v_mov_b32_e32 v1, v0 1400; GFX10-NEXT: s_setpc_b64 s[30:31] 1401; 1402; GFX11-LABEL: shuffle_v4f16_6161: 1403; GFX11: ; %bb.0: 1404; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1405; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1406; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 1407; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1408; GFX11-NEXT: s_waitcnt vmcnt(1) 1409; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 1410; GFX11-NEXT: s_waitcnt vmcnt(0) 1411; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1412; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1413; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1414; GFX11-NEXT: v_mov_b32_e32 v1, v0 1415; GFX11-NEXT: s_setpc_b64 s[30:31] 1416 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1417 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1418 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1> 1419 ret <4 x half> %shuffle 1420} 1421 1422define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1423; GFX9-LABEL: shuffle_v4f16_2333: 1424; GFX9: ; %bb.0: 1425; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1426; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 1427; GFX9-NEXT: s_waitcnt vmcnt(0) 1428; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1429; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1430; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1431; GFX9-NEXT: s_setpc_b64 s[30:31] 1432; 1433; GFX10-LABEL: shuffle_v4f16_2333: 1434; GFX10: ; %bb.0: 1435; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1436; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1437; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 1438; GFX10-NEXT: s_waitcnt vmcnt(0) 1439; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1440; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1441; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1442; GFX10-NEXT: s_setpc_b64 s[30:31] 1443; 1444; GFX11-LABEL: shuffle_v4f16_2333: 1445; GFX11: ; %bb.0: 1446; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1447; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1448; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 1449; GFX11-NEXT: s_waitcnt vmcnt(0) 1450; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1451; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1452; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 1453; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1454; GFX11-NEXT: s_setpc_b64 s[30:31] 1455 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1456 %val1 
= load <4 x half>, <4 x half> addrspace(1)* %arg1 1457 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 1458 ret <4 x half> %shuffle 1459} 1460 
; NOTE(review): the name shuffle_v4f16_6667 implies mask <6,6,6,7>, but the shufflevector in this function uses <i32 2, i32 3, i32 3, i32 3>, identical to shuffle_v4f16_2333, and the recorded output loads only through v[0:1] (apparently %arg0), so %val1 is never exercised. Presumably a copy-paste slip -- confirm the intended mask, then fix it and regenerate the assertions with utils/update_llc_test_checks.py instead of editing them by hand.
1461define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1462; GFX9-LABEL: shuffle_v4f16_6667: 1463; GFX9: ; %bb.0: 1464; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1465; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 1466; GFX9-NEXT: s_waitcnt vmcnt(0) 1467; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1468; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1469; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1470; GFX9-NEXT: s_setpc_b64 s[30:31] 1471; 1472; GFX10-LABEL: shuffle_v4f16_6667: 1473; GFX10: ; %bb.0: 1474; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1475; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1476; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 1477; GFX10-NEXT: s_waitcnt vmcnt(0) 1478; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1479; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1480; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1481; GFX10-NEXT: s_setpc_b64 s[30:31] 1482; 1483; GFX11-LABEL: shuffle_v4f16_6667: 1484; GFX11: ; %bb.0: 1485; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1486; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1487; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 1488; GFX11-NEXT: s_waitcnt vmcnt(0) 1489; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1490; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1491; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 1492; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1493; GFX11-NEXT: s_setpc_b64 s[30:31] 1494 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1495 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1496 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 1497 ret <4 x half> %shuffle 1498} 1499 1500define <4 x half> @shuffle_v8f16_0101(<8 x half> 
addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1501; GFX9-LABEL: shuffle_v8f16_0101: 1502; GFX9: ; %bb.0: 1503; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1504; GFX9-NEXT: global_load_dword v0, v[0:1], off 1505; GFX9-NEXT: s_waitcnt vmcnt(0) 1506; GFX9-NEXT: v_mov_b32_e32 v1, v0 1507; GFX9-NEXT: s_setpc_b64 s[30:31] 1508; 1509; GFX10-LABEL: shuffle_v8f16_0101: 1510; GFX10: ; %bb.0: 1511; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1512; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1513; GFX10-NEXT: global_load_dword v0, v[0:1], off 1514; GFX10-NEXT: s_waitcnt vmcnt(0) 1515; GFX10-NEXT: v_mov_b32_e32 v1, v0 1516; GFX10-NEXT: s_setpc_b64 s[30:31] 1517; 1518; GFX11-LABEL: shuffle_v8f16_0101: 1519; GFX11: ; %bb.0: 1520; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1521; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1522; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1523; GFX11-NEXT: s_waitcnt vmcnt(0) 1524; GFX11-NEXT: v_mov_b32_e32 v1, v0 1525; GFX11-NEXT: s_setpc_b64 s[30:31] 1526 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1527 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1528 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1529 ret <4 x half> %shuffle 1530} 1531 1532define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1533; GFX9-LABEL: shuffle_v8f16_0123: 1534; GFX9: ; %bb.0: 1535; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1536; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1537; GFX9-NEXT: s_waitcnt vmcnt(0) 1538; GFX9-NEXT: s_setpc_b64 s[30:31] 1539; 1540; GFX10-LABEL: shuffle_v8f16_0123: 1541; GFX10: ; %bb.0: 1542; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1543; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1544; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1545; GFX10-NEXT: s_waitcnt vmcnt(0) 1546; GFX10-NEXT: s_setpc_b64 s[30:31] 1547; 1548; GFX11-LABEL: shuffle_v8f16_0123: 1549; GFX11: ; %bb.0: 
1550; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1551; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1552; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1553; GFX11-NEXT: s_waitcnt vmcnt(0) 1554; GFX11-NEXT: s_setpc_b64 s[30:31] 1555 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1556 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1557 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1558 ret <4 x half> %shuffle 1559} 1560 1561define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1562; GFX9-LABEL: shuffle_v8f16_4589: 1563; GFX9: ; %bb.0: 1564; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1565; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 1566; GFX9-NEXT: global_load_dword v5, v[2:3], off 1567; GFX9-NEXT: s_waitcnt vmcnt(1) 1568; GFX9-NEXT: v_mov_b32_e32 v0, v4 1569; GFX9-NEXT: s_waitcnt vmcnt(0) 1570; GFX9-NEXT: v_mov_b32_e32 v1, v5 1571; GFX9-NEXT: s_setpc_b64 s[30:31] 1572; 1573; GFX10-LABEL: shuffle_v8f16_4589: 1574; GFX10: ; %bb.0: 1575; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1576; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1577; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 1578; GFX10-NEXT: global_load_dword v5, v[2:3], off 1579; GFX10-NEXT: s_waitcnt vmcnt(1) 1580; GFX10-NEXT: v_mov_b32_e32 v0, v4 1581; GFX10-NEXT: s_waitcnt vmcnt(0) 1582; GFX10-NEXT: v_mov_b32_e32 v1, v5 1583; GFX10-NEXT: s_setpc_b64 s[30:31] 1584; 1585; GFX11-LABEL: shuffle_v8f16_4589: 1586; GFX11: ; %bb.0: 1587; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1588; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1589; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 1590; GFX11-NEXT: global_load_b32 v1, v[2:3], off 1591; GFX11-NEXT: s_waitcnt vmcnt(0) 1592; GFX11-NEXT: s_setpc_b64 s[30:31] 1593 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1594 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1595 %shuffle = shufflevector 
<8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9> 1596 ret <4 x half> %shuffle 1597} 1598 1599define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1600; GFX9-LABEL: shuffle_v8f16_10_11_2_3: 1601; GFX9: ; %bb.0: 1602; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1603; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 1604; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 1605; GFX9-NEXT: s_waitcnt vmcnt(1) 1606; GFX9-NEXT: v_mov_b32_e32 v0, v4 1607; GFX9-NEXT: s_waitcnt vmcnt(0) 1608; GFX9-NEXT: v_mov_b32_e32 v1, v5 1609; GFX9-NEXT: s_setpc_b64 s[30:31] 1610; 1611; GFX10-LABEL: shuffle_v8f16_10_11_2_3: 1612; GFX10: ; %bb.0: 1613; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1614; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1615; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 1616; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 1617; GFX10-NEXT: s_waitcnt vmcnt(1) 1618; GFX10-NEXT: v_mov_b32_e32 v0, v4 1619; GFX10-NEXT: s_waitcnt vmcnt(0) 1620; GFX10-NEXT: v_mov_b32_e32 v1, v5 1621; GFX10-NEXT: s_setpc_b64 s[30:31] 1622; 1623; GFX11-LABEL: shuffle_v8f16_10_11_2_3: 1624; GFX11: ; %bb.0: 1625; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1626; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1627; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 1628; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 1629; GFX11-NEXT: s_waitcnt vmcnt(1) 1630; GFX11-NEXT: v_mov_b32_e32 v0, v2 1631; GFX11-NEXT: s_waitcnt vmcnt(0) 1632; GFX11-NEXT: s_setpc_b64 s[30:31] 1633 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1634 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1635 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3> 1636 ret <4 x half> %shuffle 1637} 1638 1639define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1640; GFX9-LABEL: 
shuffle_v8f16_13_14_2_3: 1641; GFX9: ; %bb.0: 1642; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1643; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 1644; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 1645; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1646; GFX9-NEXT: s_waitcnt vmcnt(1) 1647; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1648; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 1649; GFX9-NEXT: s_waitcnt vmcnt(0) 1650; GFX9-NEXT: v_mov_b32_e32 v1, v4 1651; GFX9-NEXT: s_setpc_b64 s[30:31] 1652; 1653; GFX10-LABEL: shuffle_v8f16_13_14_2_3: 1654; GFX10: ; %bb.0: 1655; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1656; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1657; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 1658; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 1659; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1660; GFX10-NEXT: s_waitcnt vmcnt(1) 1661; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1662; GFX10-NEXT: s_waitcnt vmcnt(0) 1663; GFX10-NEXT: v_mov_b32_e32 v1, v4 1664; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 1665; GFX10-NEXT: s_setpc_b64 s[30:31] 1666; 1667; GFX11-LABEL: shuffle_v8f16_13_14_2_3: 1668; GFX11: ; %bb.0: 1669; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1670; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1671; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 1672; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 1673; GFX11-NEXT: s_waitcnt vmcnt(1) 1674; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 1675; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1676; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1677; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 1678; GFX11-NEXT: s_waitcnt vmcnt(0) 1679; GFX11-NEXT: s_setpc_b64 s[30:31] 1680 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1681 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1682 
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3> 1683 ret <4 x half> %shuffle 1684} 1685 1686define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) { 1687; GFX9-LABEL: shuffle_v3f16_0122: 1688; GFX9: ; %bb.0: 1689; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1690; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1691; GFX9-NEXT: s_waitcnt vmcnt(0) 1692; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1693; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1694; GFX9-NEXT: s_setpc_b64 s[30:31] 1695; 1696; GFX10-LABEL: shuffle_v3f16_0122: 1697; GFX10: ; %bb.0: 1698; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1699; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1700; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1701; GFX10-NEXT: s_waitcnt vmcnt(0) 1702; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1703; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1704; GFX10-NEXT: s_setpc_b64 s[30:31] 1705; 1706; GFX11-LABEL: shuffle_v3f16_0122: 1707; GFX11: ; %bb.0: 1708; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1709; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1710; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1711; GFX11-NEXT: s_waitcnt vmcnt(0) 1712; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 1713; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1714; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1715; GFX11-NEXT: s_setpc_b64 s[30:31] 1716 %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 1717 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 1718 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 1719 ret <4 x half> %shuffle 1720} 1721 1722define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) { 1723; GFX9-LABEL: shuffle_v2f16_0122: 1724; GFX9: ; %bb.0: 1725; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1726; GFX9-NEXT: global_load_dword v0, v[0:1], off 1727; GFX9-NEXT: 
v_mov_b32_e32 v1, 0xffff 1728; GFX9-NEXT: s_waitcnt vmcnt(0) 1729; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1730; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1731; GFX9-NEXT: s_setpc_b64 s[30:31] 1732; 1733; GFX10-LABEL: shuffle_v2f16_0122: 1734; GFX10: ; %bb.0: 1735; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1736; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1737; GFX10-NEXT: global_load_dword v0, v[0:1], off 1738; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 1739; GFX10-NEXT: s_waitcnt vmcnt(0) 1740; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1741; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1742; GFX10-NEXT: s_setpc_b64 s[30:31] 1743; 1744; GFX11-LABEL: shuffle_v2f16_0122: 1745; GFX11: ; %bb.0: 1746; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1747; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1748; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1749; GFX11-NEXT: s_waitcnt vmcnt(0) 1750; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1751; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1752; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1753; GFX11-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1754; GFX11-NEXT: s_setpc_b64 s[30:31] 1755 %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 1756 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 1757 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0> 1758 ret <4 x half> %shuffle 1759} 1760 1761define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) { 1762; GFX9-LABEL: shuffle_v6f16_452367: 1763; GFX9: ; %bb.0: 1764; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1765; GFX9-NEXT: v_mov_b32_e32 v6, v1 1766; GFX9-NEXT: v_mov_b32_e32 v5, v0 1767; GFX9-NEXT: v_mov_b32_e32 v4, v3 1768; GFX9-NEXT: v_mov_b32_e32 v3, v2 1769; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1770; GFX9-NEXT: 
global_load_dword v7, v[3:4], off 1771; GFX9-NEXT: s_waitcnt vmcnt(1) 1772; GFX9-NEXT: v_mov_b32_e32 v0, v2 1773; GFX9-NEXT: s_waitcnt vmcnt(0) 1774; GFX9-NEXT: v_mov_b32_e32 v2, v7 1775; GFX9-NEXT: s_setpc_b64 s[30:31] 1776; 1777; GFX10-LABEL: shuffle_v6f16_452367: 1778; GFX10: ; %bb.0: 1779; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1780; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1781; GFX10-NEXT: v_mov_b32_e32 v6, v1 1782; GFX10-NEXT: v_mov_b32_e32 v5, v0 1783; GFX10-NEXT: v_mov_b32_e32 v4, v3 1784; GFX10-NEXT: v_mov_b32_e32 v3, v2 1785; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1786; GFX10-NEXT: global_load_dword v7, v[3:4], off 1787; GFX10-NEXT: s_waitcnt vmcnt(1) 1788; GFX10-NEXT: v_mov_b32_e32 v0, v2 1789; GFX10-NEXT: s_waitcnt vmcnt(0) 1790; GFX10-NEXT: v_mov_b32_e32 v2, v7 1791; GFX10-NEXT: s_setpc_b64 s[30:31] 1792; 1793; GFX11-LABEL: shuffle_v6f16_452367: 1794; GFX11: ; %bb.0: 1795; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1796; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1797; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off 1798; GFX11-NEXT: s_waitcnt vmcnt(0) 1799; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 1800; GFX11-NEXT: global_load_b96 v[4:6], v[2:3], off 1801; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 1802; GFX11-NEXT: s_waitcnt vmcnt(1) 1803; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16 1804; GFX11-NEXT: s_waitcnt vmcnt(0) 1805; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16 1806; GFX11-NEXT: v_mov_b32_e32 v0, v2 1807; GFX11-NEXT: s_waitcnt vmcnt(0) 1808; GFX11-NEXT: v_mov_b32_e32 v2, v3 1809; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1810; GFX11-NEXT: s_setpc_b64 s[30:31] 1811 %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 1812 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 1813 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7> 1814 ret <6 x half> %shuffle 1815} 1816 1817define amdgpu_kernel void @fma_shuffle(<4 x half> 
addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) { 1818; GFX9-LABEL: fma_shuffle: 1819; GFX9: ; %bb.0: ; %entry 1820; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1821; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1822; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1823; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1824; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1825; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1826; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1827; GFX9-NEXT: s_waitcnt vmcnt(0) 1828; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1829; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1830; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1831; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1832; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 1833; GFX9-NEXT: s_endpgm 1834; 1835; GFX10-LABEL: fma_shuffle: 1836; GFX10: ; %bb.0: ; %entry 1837; GFX10-NEXT: s_clause 0x1 1838; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1839; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1840; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1841; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1842; GFX10-NEXT: s_clause 0x2 1843; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1844; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1845; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1846; GFX10-NEXT: s_waitcnt vmcnt(0) 1847; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1848; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1849; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1850; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1851; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 1852; GFX10-NEXT: s_endpgm 1853; 1854; GFX11-LABEL: fma_shuffle: 1855; GFX11: ; %bb.0: ; %entry 1856; GFX11-NEXT: s_clause 0x1 1857; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1858; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 1859; GFX11-NEXT: 
v_lshlrev_b32_e32 v6, 3, v0 1860; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX11-NEXT: s_clause 0x2 1862; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] 1863; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] 1864; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1] 1865; GFX11-NEXT: s_waitcnt vmcnt(0) 1866; GFX11-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1867; GFX11-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1868; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1869; GFX11-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1870; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1871; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] 1872; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1873; GFX11-NEXT: s_endpgm 1874entry: 1875 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() 1876 %tmp12 = zext i32 %tmp1 to i64 1877 %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12 1878 %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8 1879 %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12 1880 %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8 1881 %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12 1882 %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8 1883 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer 1884 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1885 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1886 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19) 1887 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1> 1888 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1889 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> 
%tmp21, <2 x half> %tmp22, <2 x half> %tmp20) 1890 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1891 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 1892 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2> 1893 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1894 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27) 1895 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3> 1896 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28) 1897 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1898 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1899 store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8 1900 ret void 1901} 1902 1903define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1904; GFX9-LABEL: shuffle_v4f16_0456: 1905; GFX9: ; %bb.0: 1906; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1907; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1908; GFX9-NEXT: s_waitcnt vmcnt(0) 1909; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1910; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1911; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1912; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1913; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 1914; GFX9-NEXT: s_waitcnt vmcnt(0) 1915; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1916; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1917; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 1918; GFX9-NEXT: s_setpc_b64 s[30:31] 1919; 1920; GFX10-LABEL: shuffle_v4f16_0456: 1921; GFX10: ; %bb.0: 1922; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1923; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1924; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1925; GFX10-NEXT: s_waitcnt vmcnt(0) 1926; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1927; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1928; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1929; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1930; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 1931; GFX10-NEXT: s_waitcnt vmcnt(0) 1932; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1933; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1934; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 1935; GFX10-NEXT: s_setpc_b64 s[30:31] 1936; 1937; GFX11-LABEL: shuffle_v4f16_0456: 1938; GFX11: ; %bb.0: 1939; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1940; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1941; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 1942; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1943; GFX11-NEXT: s_waitcnt vmcnt(0) 1944; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1945; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1946; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1947; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1948; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 1949; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1950; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 1951; GFX11-NEXT: s_setpc_b64 s[30:31] 1952 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1953 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1954 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1955 ret <4 x half> %shuffle 1956} 1957 1958define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) { 1959; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: 1960; GFX9: ; %bb.0: 1961; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1962; GFX9-NEXT: v_mov_b32_e32 v4, 0 1963; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 1964; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1965; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1966; GFX9-NEXT: v_mov_b32_e32 v0, s4 1967; GFX9-NEXT: v_mov_b32_e32 v1, s5 1968; GFX9-NEXT: v_mov_b32_e32 v2, s6 1969; GFX9-NEXT: v_mov_b32_e32 v3, s7 1970; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1971; GFX9-NEXT: s_endpgm 1972; 1973; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: 1974; GFX10: ; %bb.0: 1975; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1976; GFX10-NEXT: v_mov_b32_e32 v4, 0 1977; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1978; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1979; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1980; GFX10-NEXT: v_mov_b32_e32 v0, s4 1981; GFX10-NEXT: v_mov_b32_e32 v1, s5 1982; GFX10-NEXT: v_mov_b32_e32 v2, s6 1983; GFX10-NEXT: v_mov_b32_e32 v3, s7 1984; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1985; GFX10-NEXT: s_endpgm 1986; 1987; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: 1988; GFX11: ; %bb.0: 1989; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1990; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1991; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1992; GFX11-NEXT: v_mov_b32_e32 v4, 0 1993; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1994; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 1995; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 1996; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] 1997; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1998; GFX11-NEXT: s_endpgm 1999 %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 2000 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2001 store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8 2002 ret void 2003} 2004 2005declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 2006declare i32 @llvm.amdgcn.workitem.id.x() #0 2007 2008attributes #0 = { nounwind readnone speculatable } 2009