1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s 4 5define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 6; GFX9-LABEL: shuffle_v4f16_23uu: 7; GFX9: ; %bb.0: 8; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 10; GFX9-NEXT: s_waitcnt vmcnt(0) 11; GFX9-NEXT: s_setpc_b64 s[30:31] 12; 13; GFX10-LABEL: shuffle_v4f16_23uu: 14; GFX10: ; %bb.0: 15; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 17; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 18; GFX10-NEXT: s_waitcnt vmcnt(0) 19; GFX10-NEXT: s_setpc_b64 s[30:31] 20 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 21 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 22 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 23 ret <4 x half> %shuffle 24} 25 26define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 27; GFX9-LABEL: shuffle_v4f16_234u: 28; GFX9: ; %bb.0: 29; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 31; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 32; GFX9-NEXT: s_waitcnt vmcnt(1) 33; GFX9-NEXT: v_mov_b32_e32 v0, v6 34; GFX9-NEXT: s_waitcnt vmcnt(0) 35; GFX9-NEXT: v_mov_b32_e32 v1, v4 36; GFX9-NEXT: s_setpc_b64 s[30:31] 37; 38; GFX10-LABEL: shuffle_v4f16_234u: 39; GFX10: ; %bb.0: 40; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 42; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 43; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 44; GFX10-NEXT: s_waitcnt vmcnt(1) 45; GFX10-NEXT: v_mov_b32_e32 v0, v6 46; GFX10-NEXT: s_waitcnt vmcnt(0) 47; GFX10-NEXT: v_mov_b32_e32 v1, v4 48; GFX10-NEXT: s_setpc_b64 s[30:31] 49 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 50 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 51 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef> 52 ret <4 x half> %shuffle 53} 54 55define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 56; GFX9-LABEL: shuffle_v4f16_u1u3: 57; GFX9: ; %bb.0: 58; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 59; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 60; GFX9-NEXT: s_waitcnt vmcnt(0) 61; GFX9-NEXT: s_setpc_b64 s[30:31] 62; 63; GFX10-LABEL: shuffle_v4f16_u1u3: 64; GFX10: ; %bb.0: 65; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 66; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 67; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 68; GFX10-NEXT: s_waitcnt vmcnt(0) 69; GFX10-NEXT: s_setpc_b64 s[30:31] 70 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 71 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 72 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3> 73 ret <4 x half> %shuffle 74} 75 76define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 77; GFX9-LABEL: shuffle_v4f16_u3u1: 78; GFX9: ; %bb.0: 79; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 80; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 81; GFX9-NEXT: s_waitcnt vmcnt(0) 82; GFX9-NEXT: v_mov_b32_e32 v0, v2 83; GFX9-NEXT: s_setpc_b64 s[30:31] 84; 85; GFX10-LABEL: shuffle_v4f16_u3u1: 86; GFX10: ; %bb.0: 87; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 88; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 89; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 90; GFX10-NEXT: s_waitcnt vmcnt(0) 91; GFX10-NEXT: v_mov_b32_e32 v0, v2 92; GFX10-NEXT: s_setpc_b64 s[30:31] 93 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 94 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 95 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1> 96 ret <4 x half> %shuffle 97} 98 99define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 100; GFX9-LABEL: shuffle_v4f16_u3uu: 101; GFX9: ; %bb.0: 102; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 103; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 104; GFX9-NEXT: s_waitcnt vmcnt(0) 105; GFX9-NEXT: s_setpc_b64 s[30:31] 106; 107; GFX10-LABEL: shuffle_v4f16_u3uu: 108; GFX10: ; %bb.0: 109; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 110; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 111; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 112; GFX10-NEXT: s_waitcnt vmcnt(0) 113; GFX10-NEXT: s_setpc_b64 s[30:31] 114 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 115 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 116 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef> 117 ret <4 x half> %shuffle 118} 119 120define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 121; GFX9-LABEL: shuffle_v4f16_3u6u: 122; GFX9: ; %bb.0: 123; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 124; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 125; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 126; GFX9-NEXT: s_waitcnt vmcnt(1) 127; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 128; GFX9-NEXT: s_waitcnt vmcnt(0) 129; GFX9-NEXT: v_mov_b32_e32 v1, v4 130; GFX9-NEXT: s_setpc_b64 s[30:31] 131; 132; GFX10-LABEL: shuffle_v4f16_3u6u: 133; GFX10: ; %bb.0: 134; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 135; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 136; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 137; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 138; GFX10-NEXT: s_waitcnt vmcnt(1) 139; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 140; GFX10-NEXT: s_waitcnt vmcnt(0) 141; GFX10-NEXT: v_mov_b32_e32 v1, v4 142; GFX10-NEXT: s_setpc_b64 s[30:31] 143 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 144 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 145 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef> 146 ret <4 x half> %shuffle 147} 148 149define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 150; GFX9-LABEL: shuffle_v4f16_3uu7: 151; GFX9: ; %bb.0: 152; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 153; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 154; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 155; GFX9-NEXT: s_waitcnt vmcnt(1) 156; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 157; GFX9-NEXT: s_waitcnt vmcnt(0) 158; GFX9-NEXT: v_mov_b32_e32 v1, v4 159; GFX9-NEXT: s_setpc_b64 s[30:31] 160; 161; GFX10-LABEL: shuffle_v4f16_3uu7: 162; GFX10: ; %bb.0: 163; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 164; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 165; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 166; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 167; GFX10-NEXT: s_waitcnt vmcnt(1) 168; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 169; GFX10-NEXT: s_waitcnt vmcnt(0) 170; GFX10-NEXT: v_mov_b32_e32 v1, v4 171; GFX10-NEXT: s_setpc_b64 s[30:31] 172 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 173 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 174 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7> 175 ret <4 x half> %shuffle 176} 177 178define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 179; GFX9-LABEL: shuffle_v4f16_35u5: 180; GFX9: ; %bb.0: 181; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 182; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 183; GFX9-NEXT: global_load_dword v4, v[2:3], off 184; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 185; GFX9-NEXT: s_waitcnt vmcnt(1) 186; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 187; GFX9-NEXT: s_waitcnt vmcnt(0) 188; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 189; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 190; GFX9-NEXT: v_mov_b32_e32 v1, v4 191; GFX9-NEXT: s_setpc_b64 s[30:31] 192; 193; GFX10-LABEL: shuffle_v4f16_35u5: 194; GFX10: ; %bb.0: 195; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 196; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 197; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 198; GFX10-NEXT: global_load_dword v4, v[2:3], off 199; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 200; GFX10-NEXT: s_waitcnt vmcnt(1) 201; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 202; GFX10-NEXT: s_waitcnt vmcnt(0) 203; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 204; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 205; GFX10-NEXT: v_mov_b32_e32 v1, v4 206; GFX10-NEXT: s_setpc_b64 s[30:31] 207 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 208 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 209 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5> 210 ret <4 x half> %shuffle 211} 212 213define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 214; GFX9-LABEL: shuffle_v4f16_357u: 215; GFX9: ; %bb.0: 216; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 217; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 218; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 219; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 220; GFX9-NEXT: s_waitcnt vmcnt(1) 221; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 222; GFX9-NEXT: s_waitcnt vmcnt(0) 223; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 224; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 225; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 226; GFX9-NEXT: s_setpc_b64 s[30:31] 227; 228; GFX10-LABEL: shuffle_v4f16_357u: 229; GFX10: ; %bb.0: 230; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 231; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 232; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 233; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 234; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 235; GFX10-NEXT: s_waitcnt vmcnt(1) 236; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 237; GFX10-NEXT: s_waitcnt vmcnt(0) 238; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 239; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 240; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 241; GFX10-NEXT: s_setpc_b64 s[30:31] 242 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 243 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 244 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef> 245 ret <4 x half> %shuffle 246} 247 248define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 249; GFX9-LABEL: shuffle_v4f16_0101: 250; GFX9: ; %bb.0: 251; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 252; GFX9-NEXT: global_load_dword v0, v[0:1], off 253; GFX9-NEXT: s_waitcnt vmcnt(0) 254; GFX9-NEXT: v_mov_b32_e32 v1, v0 255; GFX9-NEXT: s_setpc_b64 s[30:31] 256; 257; GFX10-LABEL: shuffle_v4f16_0101: 258; GFX10: ; %bb.0: 259; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 260; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 261; GFX10-NEXT: global_load_dword v0, v[0:1], off 262; GFX10-NEXT: s_waitcnt vmcnt(0) 263; GFX10-NEXT: v_mov_b32_e32 v1, v0 264; GFX10-NEXT: s_setpc_b64 s[30:31] 265 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 266 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 267 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 268 ret <4 x half> %shuffle 269} 270 271define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 272; GFX9-LABEL: shuffle_v4f16_0123: 273; GFX9: ; %bb.0: 274; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 275; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 276; GFX9-NEXT: s_waitcnt vmcnt(0) 277; GFX9-NEXT: s_setpc_b64 s[30:31] 278; 279; GFX10-LABEL: shuffle_v4f16_0123: 280; GFX10: ; %bb.0: 281; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 282; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 283; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 284; GFX10-NEXT: s_waitcnt vmcnt(0) 285; GFX10-NEXT: s_setpc_b64 s[30:31] 286 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 287 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 288 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 289 ret <4 x half> %shuffle 290} 291 292define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 293; GFX9-LABEL: shuffle_v4f16_0145: 294; GFX9: ; %bb.0: 295; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 296; GFX9-NEXT: global_load_dword v4, v[0:1], off 297; GFX9-NEXT: global_load_dword v5, v[2:3], off 298; GFX9-NEXT: s_waitcnt vmcnt(1) 299; GFX9-NEXT: v_mov_b32_e32 v0, v4 300; GFX9-NEXT: s_waitcnt vmcnt(0) 301; GFX9-NEXT: v_mov_b32_e32 v1, v5 302; GFX9-NEXT: s_setpc_b64 s[30:31] 303; 304; GFX10-LABEL: shuffle_v4f16_0145: 305; GFX10: ; %bb.0: 306; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 307; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 308; GFX10-NEXT: global_load_dword v4, v[0:1], off 309; GFX10-NEXT: global_load_dword v5, v[2:3], off 310; GFX10-NEXT: s_waitcnt vmcnt(1) 311; GFX10-NEXT: v_mov_b32_e32 v0, v4 312; GFX10-NEXT: s_waitcnt vmcnt(0) 313; GFX10-NEXT: v_mov_b32_e32 v1, v5 314; GFX10-NEXT: s_setpc_b64 s[30:31] 315 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 316 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 317 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 318 ret <4 x half> %shuffle 319} 320 321define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 322; GFX9-LABEL: shuffle_v4f16_0167: 323; GFX9: ; %bb.0: 324; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 325; GFX9-NEXT: global_load_dword v4, v[0:1], off 326; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 327; GFX9-NEXT: s_waitcnt vmcnt(1) 328; GFX9-NEXT: v_mov_b32_e32 v0, v4 329; GFX9-NEXT: s_waitcnt vmcnt(0) 330; GFX9-NEXT: v_mov_b32_e32 v1, v5 331; GFX9-NEXT: s_setpc_b64 s[30:31] 332; 333; GFX10-LABEL: shuffle_v4f16_0167: 334; GFX10: ; %bb.0: 335; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 336; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 337; GFX10-NEXT: global_load_dword v4, v[0:1], off 338; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 339; GFX10-NEXT: s_waitcnt vmcnt(1) 340; GFX10-NEXT: v_mov_b32_e32 v0, v4 341; GFX10-NEXT: s_waitcnt vmcnt(0) 342; GFX10-NEXT: v_mov_b32_e32 v1, v5 343; GFX10-NEXT: s_setpc_b64 s[30:31] 344 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 345 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 346 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 347 ret <4 x half> %shuffle 348} 349 350define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 351; GFX9-LABEL: shuffle_v4f16_2301: 352; GFX9: ; %bb.0: 353; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 354; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 355; GFX9-NEXT: s_waitcnt vmcnt(0) 356; GFX9-NEXT: v_mov_b32_e32 v0, v2 357; GFX9-NEXT: s_setpc_b64 s[30:31] 358; 359; GFX10-LABEL: shuffle_v4f16_2301: 360; GFX10: ; %bb.0: 361; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 362; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 363; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 364; GFX10-NEXT: s_waitcnt vmcnt(0) 365; GFX10-NEXT: v_mov_b32_e32 v0, v2 366; GFX10-NEXT: s_setpc_b64 s[30:31] 367 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 368 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 369 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 370 ret <4 x half> %shuffle 371} 372 373define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 374; GFX9-LABEL: shuffle_v4f16_2323: 375; GFX9: ; %bb.0: 376; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 377; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 378; GFX9-NEXT: s_waitcnt vmcnt(0) 379; GFX9-NEXT: v_mov_b32_e32 v1, v0 380; GFX9-NEXT: s_setpc_b64 s[30:31] 381; 382; GFX10-LABEL: shuffle_v4f16_2323: 383; GFX10: ; %bb.0: 384; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 385; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 386; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 387; GFX10-NEXT: s_waitcnt vmcnt(0) 388; GFX10-NEXT: v_mov_b32_e32 v1, v0 389; GFX10-NEXT: s_setpc_b64 s[30:31] 390 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 391 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 392 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 393 ret <4 x half> %shuffle 394} 395 396define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 397; GFX9-LABEL: shuffle_v4f16_2345: 398; GFX9: ; %bb.0: 399; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 400; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 401; GFX9-NEXT: global_load_dword v5, v[2:3], off 402; GFX9-NEXT: s_waitcnt vmcnt(1) 403; GFX9-NEXT: v_mov_b32_e32 v0, v4 404; GFX9-NEXT: s_waitcnt vmcnt(0) 405; GFX9-NEXT: v_mov_b32_e32 v1, v5 406; GFX9-NEXT: s_setpc_b64 s[30:31] 407; 408; GFX10-LABEL: shuffle_v4f16_2345: 409; GFX10: ; %bb.0: 410; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 411; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 412; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 413; GFX10-NEXT: global_load_dword v5, v[2:3], off 414; GFX10-NEXT: s_waitcnt vmcnt(1) 415; GFX10-NEXT: v_mov_b32_e32 v0, v4 416; GFX10-NEXT: s_waitcnt vmcnt(0) 417; GFX10-NEXT: v_mov_b32_e32 v1, v5 418; GFX10-NEXT: s_setpc_b64 s[30:31] 419 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 420 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 421 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 422 ret <4 x half> %shuffle 423} 424 425define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 426; GFX9-LABEL: shuffle_v4f16_2367: 427; GFX9: ; %bb.0: 428; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 429; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 430; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 431; GFX9-NEXT: s_waitcnt vmcnt(1) 432; GFX9-NEXT: v_mov_b32_e32 v0, v4 433; GFX9-NEXT: s_waitcnt vmcnt(0) 434; GFX9-NEXT: v_mov_b32_e32 v1, v5 435; GFX9-NEXT: s_setpc_b64 s[30:31] 436; 437; GFX10-LABEL: shuffle_v4f16_2367: 438; GFX10: ; %bb.0: 439; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 440; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 441; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 442; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 443; GFX10-NEXT: s_waitcnt vmcnt(1) 444; GFX10-NEXT: v_mov_b32_e32 v0, v4 445; GFX10-NEXT: s_waitcnt vmcnt(0) 446; GFX10-NEXT: v_mov_b32_e32 v1, v5 447; GFX10-NEXT: s_setpc_b64 s[30:31] 448 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 449 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 450 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 451 ret <4 x half> %shuffle 452} 453 454define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 455; GFX9-LABEL: shuffle_v4f16_4501: 456; GFX9: ; %bb.0: 457; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 458; GFX9-NEXT: global_load_dword v4, v[2:3], off 459; GFX9-NEXT: global_load_dword v5, v[0:1], off 460; GFX9-NEXT: s_waitcnt vmcnt(1) 461; GFX9-NEXT: v_mov_b32_e32 v0, v4 462; GFX9-NEXT: s_waitcnt vmcnt(0) 463; GFX9-NEXT: v_mov_b32_e32 v1, v5 464; GFX9-NEXT: s_setpc_b64 s[30:31] 465; 466; GFX10-LABEL: shuffle_v4f16_4501: 467; GFX10: ; %bb.0: 468; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 469; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 470; GFX10-NEXT: global_load_dword v4, v[2:3], off 471; GFX10-NEXT: global_load_dword v5, v[0:1], off 472; GFX10-NEXT: s_waitcnt vmcnt(1) 473; GFX10-NEXT: v_mov_b32_e32 v0, v4 474; GFX10-NEXT: s_waitcnt vmcnt(0) 475; GFX10-NEXT: v_mov_b32_e32 v1, v5 476; GFX10-NEXT: s_setpc_b64 s[30:31] 477 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 478 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 479 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 480 ret <4 x half> %shuffle 481} 482 483define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 484; GFX9-LABEL: shuffle_v4f16_4523: 485; GFX9: ; %bb.0: 486; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 487; GFX9-NEXT: global_load_dword v4, v[2:3], off 488; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 489; GFX9-NEXT: s_waitcnt vmcnt(1) 490; GFX9-NEXT: v_mov_b32_e32 v0, v4 491; GFX9-NEXT: s_waitcnt vmcnt(0) 492; GFX9-NEXT: v_mov_b32_e32 v1, v5 493; GFX9-NEXT: s_setpc_b64 s[30:31] 494; 495; GFX10-LABEL: shuffle_v4f16_4523: 496; GFX10: ; %bb.0: 497; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 498; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 499; GFX10-NEXT: global_load_dword v4, v[2:3], off 500; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 501; GFX10-NEXT: s_waitcnt vmcnt(1) 502; GFX10-NEXT: v_mov_b32_e32 v0, v4 503; GFX10-NEXT: s_waitcnt vmcnt(0) 504; GFX10-NEXT: v_mov_b32_e32 v1, v5 505; GFX10-NEXT: s_setpc_b64 s[30:31] 506 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 507 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 508 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 509 ret <4 x half> %shuffle 510} 511 512define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 513; GFX9-LABEL: shuffle_v4f16_4545: 514; GFX9: ; %bb.0: 515; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 516; GFX9-NEXT: global_load_dword v0, v[2:3], off 517; GFX9-NEXT: s_waitcnt vmcnt(0) 518; GFX9-NEXT: v_mov_b32_e32 v1, v0 519; GFX9-NEXT: s_setpc_b64 s[30:31] 520; 521; GFX10-LABEL: shuffle_v4f16_4545: 522; GFX10: ; %bb.0: 523; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 524; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 525; GFX10-NEXT: global_load_dword v0, v[2:3], off 526; GFX10-NEXT: s_waitcnt vmcnt(0) 527; GFX10-NEXT: v_mov_b32_e32 v1, v0 528; GFX10-NEXT: s_setpc_b64 s[30:31] 529 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 530 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 531 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5> 532 ret <4 x half> %shuffle 533} 534 535define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 536; GFX9-LABEL: shuffle_v4f16_4567: 537; GFX9: ; %bb.0: 538; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 539; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off 540; GFX9-NEXT: s_waitcnt vmcnt(0) 541; GFX9-NEXT: s_setpc_b64 s[30:31] 542; 543; GFX10-LABEL: shuffle_v4f16_4567: 544; GFX10: ; %bb.0: 545; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 546; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 547; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off 548; GFX10-NEXT: s_waitcnt vmcnt(0) 549; GFX10-NEXT: s_setpc_b64 s[30:31] 550 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 551 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 552 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 553 ret <4 x half> %shuffle 554} 555 556define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 557; GFX9-LABEL: shuffle_v4f16_6701: 558; GFX9: ; %bb.0: 559; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 560; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 561; GFX9-NEXT: global_load_dword v5, v[0:1], off 562; GFX9-NEXT: s_waitcnt vmcnt(1) 563; GFX9-NEXT: v_mov_b32_e32 v0, v4 564; GFX9-NEXT: s_waitcnt vmcnt(0) 565; GFX9-NEXT: v_mov_b32_e32 v1, v5 566; GFX9-NEXT: s_setpc_b64 s[30:31] 567; 568; GFX10-LABEL: shuffle_v4f16_6701: 569; GFX10: ; %bb.0: 570; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 571; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 572; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 573; GFX10-NEXT: global_load_dword v5, v[0:1], off 574; GFX10-NEXT: s_waitcnt vmcnt(1) 575; GFX10-NEXT: v_mov_b32_e32 v0, v4 576; GFX10-NEXT: s_waitcnt vmcnt(0) 577; GFX10-NEXT: v_mov_b32_e32 v1, v5 578; GFX10-NEXT: s_setpc_b64 s[30:31] 579 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 580 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 581 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 582 ret <4 x half> %shuffle 583} 584 585define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 586; GFX9-LABEL: shuffle_v4f16_6723: 587; GFX9: ; %bb.0: 588; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 589; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 590; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 591; GFX9-NEXT: s_waitcnt vmcnt(1) 592; GFX9-NEXT: v_mov_b32_e32 v0, v4 593; GFX9-NEXT: s_waitcnt vmcnt(0) 594; GFX9-NEXT: v_mov_b32_e32 v1, v5 595; GFX9-NEXT: s_setpc_b64 s[30:31] 596; 597; GFX10-LABEL: shuffle_v4f16_6723: 598; GFX10: ; %bb.0: 599; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 600; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 601; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 602; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 603; GFX10-NEXT: s_waitcnt vmcnt(1) 604; GFX10-NEXT: v_mov_b32_e32 v0, v4 605; GFX10-NEXT: s_waitcnt vmcnt(0) 606; GFX10-NEXT: v_mov_b32_e32 v1, v5 607; GFX10-NEXT: s_setpc_b64 s[30:31] 608 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 609 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 610 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 611 ret <4 x half> %shuffle 612} 613 614define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 615; GFX9-LABEL: shuffle_v4f16_6745: 616; GFX9: ; %bb.0: 617; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 618; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 619; GFX9-NEXT: s_waitcnt vmcnt(0) 620; GFX9-NEXT: v_mov_b32_e32 v0, v2 621; GFX9-NEXT: s_setpc_b64 s[30:31] 622; 623; GFX10-LABEL: shuffle_v4f16_6745: 624; GFX10: ; %bb.0: 625; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 626; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 627; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 628; GFX10-NEXT: s_waitcnt vmcnt(0) 629; GFX10-NEXT: v_mov_b32_e32 v0, v2 630; GFX10-NEXT: s_setpc_b64 s[30:31] 631 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 632 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 633 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5> 634 ret <4 x half> %shuffle 635} 636 637define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 638; GFX9-LABEL: shuffle_v4f16_6767: 639; GFX9: ; %bb.0: 640; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 641; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 642; GFX9-NEXT: s_waitcnt vmcnt(0) 643; GFX9-NEXT: v_mov_b32_e32 v1, v0 644; GFX9-NEXT: s_setpc_b64 s[30:31] 645; 646; GFX10-LABEL: shuffle_v4f16_6767: 647; GFX10: ; %bb.0: 648; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 649; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 650; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 651; GFX10-NEXT: s_waitcnt vmcnt(0) 652; GFX10-NEXT: v_mov_b32_e32 v1, v0 653; GFX10-NEXT: s_setpc_b64 s[30:31] 654 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 655 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 656 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7> 657 ret <4 x half> %shuffle 658} 659 660define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 661; GFX9-LABEL: shuffle_v4f16_2356: 662; GFX9: ; %bb.0: 663; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 664; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 665; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 666; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 667; GFX9-NEXT: s_waitcnt vmcnt(1) 668; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 669; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 670; GFX9-NEXT: s_waitcnt vmcnt(0) 671; GFX9-NEXT: v_mov_b32_e32 v0, v4 672; GFX9-NEXT: s_setpc_b64 s[30:31] 673; 674; GFX10-LABEL: shuffle_v4f16_2356: 675; GFX10: ; %bb.0: 676; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 677; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 678; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 679; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 680; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 681; GFX10-NEXT: s_waitcnt vmcnt(1) 682; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 683; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 684; GFX10-NEXT: s_waitcnt vmcnt(0) 685; GFX10-NEXT: v_mov_b32_e32 v0, v4 686; GFX10-NEXT: s_setpc_b64 s[30:31] 687 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 688 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 689 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 690 ret <4 x half> %shuffle 691} 692 693define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 694; GFX9-LABEL: shuffle_v4f16_5623: 695; GFX9: ; %bb.0: 696; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 697; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 698; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 699; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 700; GFX9-NEXT: s_waitcnt vmcnt(1) 701; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 702; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 703; GFX9-NEXT: s_waitcnt vmcnt(0) 704; GFX9-NEXT: v_mov_b32_e32 v1, v4 705; GFX9-NEXT: s_setpc_b64 s[30:31] 706; 707; GFX10-LABEL: shuffle_v4f16_5623: 708; GFX10: ; %bb.0: 709; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 710; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 711; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 712; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 713; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 714; GFX10-NEXT: s_waitcnt vmcnt(1) 715; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 716; GFX10-NEXT: s_waitcnt vmcnt(0) 717; GFX10-NEXT: v_mov_b32_e32 v1, v4 718; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 719; GFX10-NEXT: s_setpc_b64 s[30:31] 720 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 721 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 722 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3> 723 ret <4 x half> %shuffle 724} 725 726define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 727; GFX9-LABEL: shuffle_v4f16_3456: 728; GFX9: ; %bb.0: 729; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 730; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 731; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 732; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 733; GFX9-NEXT: s_waitcnt vmcnt(1) 734; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 735; GFX9-NEXT: s_waitcnt vmcnt(0) 736; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 737; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 738; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 739; GFX9-NEXT: s_setpc_b64 s[30:31] 740; 741; GFX10-LABEL: shuffle_v4f16_3456: 742; GFX10: ; %bb.0: 743; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 744; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 745; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 746; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 747; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 748; GFX10-NEXT: s_waitcnt vmcnt(1) 749; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 750; GFX10-NEXT: s_waitcnt vmcnt(0) 751; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 752; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 753; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 754; GFX10-NEXT: s_setpc_b64 s[30:31] 755 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 756 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 757 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 758 ret <4 x half> %shuffle 759} 760 761define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 762; GFX9-LABEL: shuffle_v4f16_5634: 763; GFX9: ; %bb.0: 764; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 765; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 766; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 767; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 768; GFX9-NEXT: s_waitcnt vmcnt(1) 769; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 770; GFX9-NEXT: s_waitcnt vmcnt(0) 771; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 772; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 773; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 774; GFX9-NEXT: s_setpc_b64 s[30:31] 775; 776; GFX10-LABEL: shuffle_v4f16_5634: 777; GFX10: ; %bb.0: 778; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 779; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 780; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 781; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 782; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 783; GFX10-NEXT: s_waitcnt vmcnt(1) 784; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 785; GFX10-NEXT: s_waitcnt vmcnt(0) 786; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 787; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 788; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 789; GFX10-NEXT: s_setpc_b64 s[30:31] 790 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 791 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 792 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4> 793 ret <4 x half> %shuffle 794} 795 796define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 797; GFX9-LABEL: shuffle_v4f16_5734: 798; GFX9: ; %bb.0: 799; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 800; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 801; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 802; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 803; GFX9-NEXT: s_waitcnt vmcnt(1) 804; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 805; GFX9-NEXT: s_waitcnt vmcnt(0) 806; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 807; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 808; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 809; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 810; GFX9-NEXT: s_setpc_b64 s[30:31] 811; 812; GFX10-LABEL: shuffle_v4f16_5734: 813; GFX10: ; %bb.0: 814; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 815; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 816; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 817; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 818; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 819; GFX10-NEXT: s_waitcnt vmcnt(1) 820; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 821; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v5 822; GFX10-NEXT: s_waitcnt vmcnt(0) 823; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 824; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 825; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 826; GFX10-NEXT: s_setpc_b64 s[30:31] 827 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 828 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 829 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4> 830 ret <4 x half> %shuffle 831} 832 833define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 834; GFX9-LABEL: shuffle_v4i16_2356: 835; GFX9: ; %bb.0: 836; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 837; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 838; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 839; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 840; GFX9-NEXT: s_waitcnt vmcnt(1) 841; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 842; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 843; GFX9-NEXT: s_waitcnt vmcnt(0) 844; GFX9-NEXT: v_mov_b32_e32 v0, v4 845; GFX9-NEXT: s_setpc_b64 s[30:31] 846; 847; GFX10-LABEL: shuffle_v4i16_2356: 848; GFX10: ; %bb.0: 849; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 850; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 851; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 852; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 853; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 854; GFX10-NEXT: s_waitcnt vmcnt(1) 855; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 856; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 857; GFX10-NEXT: s_waitcnt vmcnt(0) 858; GFX10-NEXT: v_mov_b32_e32 v0, v4 859; GFX10-NEXT: s_setpc_b64 s[30:31] 860 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 861 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 862 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 863 ret <4 x i16> %shuffle 864} 865 866define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 867; GFX9-LABEL: shuffle_v4i16_0167: 868; GFX9: ; %bb.0: 869; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 870; GFX9-NEXT: global_load_dword v4, v[0:1], off 871; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 872; GFX9-NEXT: s_waitcnt vmcnt(1) 873; GFX9-NEXT: v_mov_b32_e32 v0, v4 874; GFX9-NEXT: s_waitcnt vmcnt(0) 875; GFX9-NEXT: v_mov_b32_e32 v1, v5 876; GFX9-NEXT: s_setpc_b64 s[30:31] 877; 878; GFX10-LABEL: shuffle_v4i16_0167: 879; GFX10: ; %bb.0: 880; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 881; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 882; GFX10-NEXT: global_load_dword v4, v[0:1], off 883; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 884; GFX10-NEXT: s_waitcnt vmcnt(1) 885; GFX10-NEXT: v_mov_b32_e32 v0, v4 886; GFX10-NEXT: s_waitcnt vmcnt(0) 887; GFX10-NEXT: v_mov_b32_e32 v1, v5 888; GFX10-NEXT: s_setpc_b64 s[30:31] 889 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 890 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 891 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 892 ret <4 x i16> %shuffle 893} 894 895define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 896; GFX9-LABEL: shuffle_v4f16_0000: 897; GFX9: ; %bb.0: 898; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 899; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 900; GFX9-NEXT: s_waitcnt vmcnt(0) 901; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 902; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 903; GFX9-NEXT: v_mov_b32_e32 v1, v0 904; GFX9-NEXT: s_setpc_b64 s[30:31] 905; 906; GFX10-LABEL: shuffle_v4f16_0000: 907; GFX10: ; %bb.0: 908; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 909; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 910; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 911; GFX10-NEXT: s_waitcnt vmcnt(0) 912; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0 913; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 914; GFX10-NEXT: v_mov_b32_e32 v1, v0 915; GFX10-NEXT: s_setpc_b64 s[30:31] 916 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 917 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 918 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer 919 ret <4 x half> %shuffle 920} 921 922define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 923; GFX9-LABEL: shuffle_v4f16_1010: 924; GFX9: ; %bb.0: 925; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 926; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 927; GFX9-NEXT: s_waitcnt vmcnt(0) 928; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 929; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 930; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 931; GFX9-NEXT: v_mov_b32_e32 v1, v0 932; GFX9-NEXT: s_setpc_b64 s[30:31] 933; 934; GFX10-LABEL: shuffle_v4f16_1010: 935; GFX10: ; %bb.0: 936; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 937; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 938; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 939; GFX10-NEXT: s_waitcnt vmcnt(0) 940; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 941; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 942; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 943; GFX10-NEXT: v_mov_b32_e32 v1, v0 944; GFX10-NEXT: s_setpc_b64 s[30:31] 945 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 946 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 947 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0> 948 ret <4 x half> %shuffle 949} 950 951define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 952; GFX9-LABEL: shuffle_v4f16_1100: 953; GFX9: ; %bb.0: 954; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 955; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 956; GFX9-NEXT: s_waitcnt vmcnt(0) 957; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 958; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 959; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 960; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 961; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 962; GFX9-NEXT: s_setpc_b64 s[30:31] 963; 964; GFX10-LABEL: shuffle_v4f16_1100: 965; GFX10: ; %bb.0: 966; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 967; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 968; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 969; GFX10-NEXT: s_waitcnt vmcnt(0) 970; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 971; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v1 972; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 973; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 974; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 975; GFX10-NEXT: s_setpc_b64 s[30:31] 976 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 977 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 978 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0> 979 ret <4 x half> %shuffle 980} 981 982define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 983; GFX9-LABEL: shuffle_v4f16_6161: 984; GFX9: ; %bb.0: 985; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 986; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 987; GFX9-NEXT: global_load_dword v5, v[0:1], off 988; GFX9-NEXT: s_waitcnt vmcnt(1) 989; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 990; GFX9-NEXT: s_waitcnt vmcnt(0) 991; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 992; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 993; GFX9-NEXT: v_mov_b32_e32 v1, v0 994; GFX9-NEXT: s_setpc_b64 s[30:31] 995; 996; GFX10-LABEL: shuffle_v4f16_6161: 997; GFX10: ; %bb.0: 998; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 999; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1000; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 1001; GFX10-NEXT: global_load_dword v5, v[0:1], off 1002; GFX10-NEXT: s_waitcnt vmcnt(1) 1003; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 1004; GFX10-NEXT: s_waitcnt vmcnt(0) 1005; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 1006; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1007; GFX10-NEXT: v_mov_b32_e32 v1, v0 1008; GFX10-NEXT: s_setpc_b64 s[30:31] 1009 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1010 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1011 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1> 1012 ret <4 x half> %shuffle 1013} 1014 1015define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1016; GFX9-LABEL: shuffle_v4f16_2333: 1017; GFX9: ; %bb.0: 1018; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1019; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 1020; GFX9-NEXT: s_waitcnt vmcnt(0) 1021; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1022; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1023; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1024; GFX9-NEXT: s_setpc_b64 s[30:31] 1025; 1026; GFX10-LABEL: shuffle_v4f16_2333: 1027; GFX10: ; %bb.0: 1028; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1029; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1030; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 1031; GFX10-NEXT: s_waitcnt vmcnt(0) 1032; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1033; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1034; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1035; GFX10-NEXT: s_setpc_b64 s[30:31] 1036 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1037 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1038 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 1039 ret <4 x half> %shuffle 1040} 1041 1042define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1043; GFX9-LABEL: shuffle_v4f16_6667: 1044; GFX9: ; %bb.0: 1045; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1046; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 1047; GFX9-NEXT: s_waitcnt vmcnt(0) 1048; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1049; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1050; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1051; GFX9-NEXT: s_setpc_b64 s[30:31] 1052; 1053; GFX10-LABEL: shuffle_v4f16_6667: 1054; GFX10: ; %bb.0: 1055; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1056; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1057; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 1058; GFX10-NEXT: s_waitcnt vmcnt(0) 1059; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1060; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1061; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1062; GFX10-NEXT: s_setpc_b64 s[30:31] 1063 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1064 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1065 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 1066 ret <4 x half> %shuffle 1067} 1068 1069define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1070; GFX9-LABEL: shuffle_v8f16_0101: 1071; GFX9: ; %bb.0: 1072; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1073; GFX9-NEXT: global_load_dword v0, v[0:1], off 1074; GFX9-NEXT: s_waitcnt vmcnt(0) 1075; GFX9-NEXT: v_mov_b32_e32 v1, v0 1076; GFX9-NEXT: s_setpc_b64 s[30:31] 1077; 1078; GFX10-LABEL: shuffle_v8f16_0101: 1079; GFX10: ; %bb.0: 1080; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1081; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1082; GFX10-NEXT: global_load_dword v0, v[0:1], off 1083; GFX10-NEXT: s_waitcnt vmcnt(0) 1084; GFX10-NEXT: v_mov_b32_e32 v1, v0 1085; GFX10-NEXT: s_setpc_b64 s[30:31] 1086 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1087 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1088 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1089 ret <4 x half> %shuffle 1090} 1091 1092define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1093; GFX9-LABEL: shuffle_v8f16_0123: 1094; GFX9: ; %bb.0: 1095; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1096; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1097; GFX9-NEXT: s_waitcnt vmcnt(0) 1098; GFX9-NEXT: s_setpc_b64 s[30:31] 1099; 1100; GFX10-LABEL: shuffle_v8f16_0123: 1101; GFX10: ; %bb.0: 1102; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1103; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1104; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1105; GFX10-NEXT: s_waitcnt vmcnt(0) 1106; GFX10-NEXT: s_setpc_b64 s[30:31] 1107 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1108 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1109 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1110 ret <4 x half> %shuffle 1111} 1112 1113define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1114; GFX9-LABEL: shuffle_v8f16_4589: 1115; GFX9: ; %bb.0: 1116; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1117; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 1118; GFX9-NEXT: global_load_dword v5, v[2:3], off 1119; GFX9-NEXT: s_waitcnt vmcnt(1) 1120; GFX9-NEXT: v_mov_b32_e32 v0, v4 1121; GFX9-NEXT: s_waitcnt vmcnt(0) 1122; GFX9-NEXT: v_mov_b32_e32 v1, v5 1123; GFX9-NEXT: s_setpc_b64 s[30:31] 1124; 1125; GFX10-LABEL: shuffle_v8f16_4589: 1126; GFX10: ; %bb.0: 1127; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1128; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1129; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 1130; GFX10-NEXT: global_load_dword v5, v[2:3], off 1131; GFX10-NEXT: s_waitcnt vmcnt(1) 1132; GFX10-NEXT: v_mov_b32_e32 v0, v4 1133; GFX10-NEXT: s_waitcnt vmcnt(0) 1134; GFX10-NEXT: v_mov_b32_e32 v1, v5 1135; GFX10-NEXT: s_setpc_b64 s[30:31] 1136 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1137 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1138 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9> 1139 ret <4 x half> %shuffle 1140} 1141 1142define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1143; GFX9-LABEL: shuffle_v8f16_10_11_2_3: 1144; GFX9: ; %bb.0: 1145; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1146; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 1147; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 1148; GFX9-NEXT: s_waitcnt vmcnt(1) 1149; GFX9-NEXT: v_mov_b32_e32 v0, v4 1150; GFX9-NEXT: s_waitcnt vmcnt(0) 1151; GFX9-NEXT: v_mov_b32_e32 v1, v5 1152; GFX9-NEXT: s_setpc_b64 s[30:31] 1153; 1154; GFX10-LABEL: shuffle_v8f16_10_11_2_3: 1155; GFX10: ; %bb.0: 1156; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1157; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1158; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 1159; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 1160; GFX10-NEXT: s_waitcnt vmcnt(1) 1161; GFX10-NEXT: v_mov_b32_e32 v0, v4 1162; GFX10-NEXT: s_waitcnt vmcnt(0) 1163; GFX10-NEXT: v_mov_b32_e32 v1, v5 1164; GFX10-NEXT: s_setpc_b64 s[30:31] 1165 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1166 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1167 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3> 1168 ret <4 x half> %shuffle 1169} 1170 1171define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1172; GFX9-LABEL: shuffle_v8f16_13_14_2_3: 1173; GFX9: ; %bb.0: 1174; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1175; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 1176; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 1177; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1178; GFX9-NEXT: s_waitcnt vmcnt(1) 1179; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1180; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 1181; GFX9-NEXT: s_waitcnt vmcnt(0) 1182; GFX9-NEXT: v_mov_b32_e32 v1, v4 1183; GFX9-NEXT: s_setpc_b64 s[30:31] 1184; 1185; GFX10-LABEL: shuffle_v8f16_13_14_2_3: 1186; GFX10: ; %bb.0: 1187; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1188; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1189; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 1190; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 1191; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1192; GFX10-NEXT: s_waitcnt vmcnt(1) 1193; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1194; GFX10-NEXT: s_waitcnt vmcnt(0) 1195; GFX10-NEXT: v_mov_b32_e32 v1, v4 1196; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 1197; GFX10-NEXT: s_setpc_b64 s[30:31] 1198 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1199 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1200 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3> 1201 ret <4 x half> %shuffle 1202} 1203 1204define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) { 1205; GFX9-LABEL: shuffle_v3f16_0122: 1206; GFX9: ; %bb.0: 1207; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1208; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1209; GFX9-NEXT: s_waitcnt vmcnt(0) 1210; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1211; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1212; GFX9-NEXT: s_setpc_b64 s[30:31] 1213; 1214; GFX10-LABEL: shuffle_v3f16_0122: 1215; GFX10: ; %bb.0: 1216; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1217; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1218; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1219; GFX10-NEXT: s_waitcnt vmcnt(0) 1220; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1221; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1222; GFX10-NEXT: s_setpc_b64 s[30:31] 1223 %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 1224 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 1225 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 1226 ret <4 x half> %shuffle 1227} 1228 1229define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) { 1230; GFX9-LABEL: shuffle_v2f16_0122: 1231; GFX9: ; %bb.0: 1232; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1233; GFX9-NEXT: global_load_dword v0, v[0:1], off 1234; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 1235; GFX9-NEXT: s_waitcnt vmcnt(0) 1236; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1237; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1238; GFX9-NEXT: s_setpc_b64 s[30:31] 1239; 1240; GFX10-LABEL: shuffle_v2f16_0122: 1241; GFX10: ; %bb.0: 1242; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1243; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1244; GFX10-NEXT: global_load_dword v0, v[0:1], off 1245; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 1246; GFX10-NEXT: s_waitcnt vmcnt(0) 1247; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1248; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1249; GFX10-NEXT: s_setpc_b64 s[30:31] 1250 %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 1251 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 1252 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0> 1253 ret <4 x half> %shuffle 1254} 1255 1256define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) { 1257; GFX9-LABEL: shuffle_v6f16_452367: 1258; GFX9: ; %bb.0: 1259; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1260; GFX9-NEXT: v_mov_b32_e32 v6, v1 1261; GFX9-NEXT: v_mov_b32_e32 v5, v0 1262; GFX9-NEXT: v_mov_b32_e32 v4, v3 1263; GFX9-NEXT: v_mov_b32_e32 v3, v2 1264; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1265; GFX9-NEXT: global_load_dword v7, v[3:4], off 1266; GFX9-NEXT: s_waitcnt vmcnt(1) 1267; GFX9-NEXT: v_mov_b32_e32 v0, v2 1268; GFX9-NEXT: s_waitcnt vmcnt(0) 1269; GFX9-NEXT: v_mov_b32_e32 v2, v7 1270; GFX9-NEXT: s_setpc_b64 s[30:31] 1271; 1272; GFX10-LABEL: shuffle_v6f16_452367: 1273; GFX10: ; %bb.0: 1274; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1275; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1276; GFX10-NEXT: v_mov_b32_e32 v6, v1 1277; GFX10-NEXT: v_mov_b32_e32 v5, v0 1278; GFX10-NEXT: v_mov_b32_e32 v4, v3 1279; GFX10-NEXT: v_mov_b32_e32 v3, v2 1280; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1281; GFX10-NEXT: global_load_dword v7, v[3:4], off 1282; GFX10-NEXT: s_waitcnt vmcnt(1) 1283; GFX10-NEXT: v_mov_b32_e32 v0, v2 1284; GFX10-NEXT: s_waitcnt vmcnt(0) 1285; GFX10-NEXT: v_mov_b32_e32 v2, v7 1286; GFX10-NEXT: s_setpc_b64 s[30:31] 1287 %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 1288 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 1289 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7> 1290 ret <6 x half> %shuffle 1291} 1292 1293define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) { 1294; GFX9-LABEL: fma_shuffle: 1295; GFX9: ; %bb.0: ; %entry 1296; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1297; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1298; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1299; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1300; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1301; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1302; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1303; GFX9-NEXT: s_waitcnt vmcnt(0) 1304; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1305; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1306; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1307; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1308; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 1309; GFX9-NEXT: s_endpgm 1310; 1311; GFX10-LABEL: fma_shuffle: 1312; GFX10: ; %bb.0: ; %entry 1313; GFX10-NEXT: s_clause 0x1 1314; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1315; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1316; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1317; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1318; GFX10-NEXT: s_clause 0x2 1319; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1320; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1321; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1322; GFX10-NEXT: s_waitcnt vmcnt(0) 1323; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1324; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1325; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1326; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1327; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 1328; GFX10-NEXT: s_endpgm 1329entry: 1330 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() 1331 %tmp12 = zext i32 %tmp1 to i64 1332 %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12 1333 %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8 1334 %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12 1335 %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8 1336 %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12 1337 %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8 1338 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer 1339 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1340 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1341 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19) 1342 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1> 1343 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1344 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20) 1345 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1346 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 1347 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2> 1348 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1349 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27) 1350 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3> 1351 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28) 1352 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1353 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1354 store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8 1355 ret void 1356} 1357 1358define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1359; GFX9-LABEL: shuffle_v4f16_0456: 1360; GFX9: ; %bb.0: 1361; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1362; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1363; GFX9-NEXT: s_waitcnt vmcnt(0) 1364; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1365; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1366; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1367; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1368; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 1369; GFX9-NEXT: s_waitcnt vmcnt(0) 1370; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1371; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1372; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 1373; GFX9-NEXT: s_setpc_b64 s[30:31] 1374; 1375; GFX10-LABEL: shuffle_v4f16_0456: 1376; GFX10: ; %bb.0: 1377; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1378; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1379; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1380; GFX10-NEXT: s_waitcnt vmcnt(0) 1381; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1382; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1383; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1384; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1385; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 1386; GFX10-NEXT: s_waitcnt vmcnt(0) 1387; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1388; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1389; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 1390; GFX10-NEXT: s_setpc_b64 s[30:31] 1391 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1392 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1393 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1394 ret <4 x half> %shuffle 1395} 1396 1397define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) { 1398; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: 1399; GFX9: ; %bb.0: 1400; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1401; GFX9-NEXT: v_mov_b32_e32 v4, 0 1402; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1404; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1405; GFX9-NEXT: v_mov_b32_e32 v0, s4 1406; GFX9-NEXT: v_mov_b32_e32 v1, s5 1407; GFX9-NEXT: v_mov_b32_e32 v2, s6 1408; GFX9-NEXT: v_mov_b32_e32 v3, s7 1409; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1410; GFX9-NEXT: s_endpgm 1411; 1412; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: 1413; GFX10: ; %bb.0: 1414; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1415; GFX10-NEXT: v_mov_b32_e32 v4, 0 1416; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1418; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1419; GFX10-NEXT: v_mov_b32_e32 v0, s4 1420; GFX10-NEXT: v_mov_b32_e32 v1, s5 1421; GFX10-NEXT: v_mov_b32_e32 v2, s6 1422; GFX10-NEXT: v_mov_b32_e32 v3, s7 1423; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1424; GFX10-NEXT: s_endpgm 1425 %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 1426 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1427 store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8 1428 ret void 1429} 1430 1431declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 1432declare i32 @llvm.amdgcn.workitem.id.x() #0 1433 1434attributes #0 = { nounwind readnone speculatable } 1435