1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s 4 5define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 6; GFX9-LABEL: shuffle_v4f16_23uu: 7; GFX9: ; %bb.0: 8; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 10; GFX9-NEXT: s_waitcnt vmcnt(0) 11; GFX9-NEXT: s_setpc_b64 s[30:31] 12; 13; GFX10-LABEL: shuffle_v4f16_23uu: 14; GFX10: ; %bb.0: 15; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 17; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 18; GFX10-NEXT: s_waitcnt vmcnt(0) 19; GFX10-NEXT: s_setpc_b64 s[30:31] 20 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 21 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 22 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 23 ret <4 x half> %shuffle 24} 25 26define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 27; GFX9-LABEL: shuffle_v4f16_234u: 28; GFX9: ; %bb.0: 29; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 31; GFX9-NEXT: s_waitcnt vmcnt(0) 32; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 33; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 34; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 35; GFX9-NEXT: v_mov_b32_e32 v1, v4 36; GFX9-NEXT: s_waitcnt vmcnt(0) 37; GFX9-NEXT: v_mov_b32_e32 v0, v5 38; GFX9-NEXT: s_setpc_b64 s[30:31] 39; 40; GFX10-LABEL: shuffle_v4f16_234u: 41; GFX10: ; %bb.0: 42; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 43; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 44; GFX10-NEXT: 
global_load_dword v6, v[0:1], off offset:4 45; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 46; GFX10-NEXT: s_waitcnt vmcnt(1) 47; GFX10-NEXT: v_mov_b32_e32 v0, v6 48; GFX10-NEXT: s_waitcnt vmcnt(0) 49; GFX10-NEXT: v_mov_b32_e32 v1, v4 50; GFX10-NEXT: s_setpc_b64 s[30:31] 51 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 52 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 53 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef> 54 ret <4 x half> %shuffle 55} 56 57define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 58; GFX9-LABEL: shuffle_v4f16_u1u3: 59; GFX9: ; %bb.0: 60; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 61; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 62; GFX9-NEXT: s_waitcnt vmcnt(0) 63; GFX9-NEXT: s_setpc_b64 s[30:31] 64; 65; GFX10-LABEL: shuffle_v4f16_u1u3: 66; GFX10: ; %bb.0: 67; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 68; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 69; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 70; GFX10-NEXT: s_waitcnt vmcnt(0) 71; GFX10-NEXT: s_setpc_b64 s[30:31] 72 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 73 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 74 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3> 75 ret <4 x half> %shuffle 76} 77 78define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 79; GFX9-LABEL: shuffle_v4f16_u3u1: 80; GFX9: ; %bb.0: 81; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 82; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 83; GFX9-NEXT: s_waitcnt vmcnt(0) 84; GFX9-NEXT: v_mov_b32_e32 v0, v2 85; GFX9-NEXT: s_setpc_b64 s[30:31] 86; 87; GFX10-LABEL: shuffle_v4f16_u3u1: 88; GFX10: ; %bb.0: 89; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 90; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 91; GFX10-NEXT: 
global_load_dwordx2 v[1:2], v[0:1], off 92; GFX10-NEXT: s_waitcnt vmcnt(0) 93; GFX10-NEXT: v_mov_b32_e32 v0, v2 94; GFX10-NEXT: s_setpc_b64 s[30:31] 95 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 96 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 97 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1> 98 ret <4 x half> %shuffle 99} 100 101define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 102; GFX9-LABEL: shuffle_v4f16_u3uu: 103; GFX9: ; %bb.0: 104; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 105; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 106; GFX9-NEXT: s_waitcnt vmcnt(0) 107; GFX9-NEXT: s_setpc_b64 s[30:31] 108; 109; GFX10-LABEL: shuffle_v4f16_u3uu: 110; GFX10: ; %bb.0: 111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 112; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 113; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 114; GFX10-NEXT: s_waitcnt vmcnt(0) 115; GFX10-NEXT: s_setpc_b64 s[30:31] 116 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 117 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 118 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef> 119 ret <4 x half> %shuffle 120} 121 122define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 123; GFX9-LABEL: shuffle_v4f16_3u6u: 124; GFX9: ; %bb.0: 125; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 126; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 127; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 128; GFX9-NEXT: s_waitcnt vmcnt(1) 129; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 130; GFX9-NEXT: s_waitcnt vmcnt(0) 131; GFX9-NEXT: v_mov_b32_e32 v1, v4 132; GFX9-NEXT: s_setpc_b64 s[30:31] 133; 134; GFX10-LABEL: shuffle_v4f16_3u6u: 135; GFX10: ; %bb.0: 136; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 137; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 138; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 139; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 140; GFX10-NEXT: s_waitcnt vmcnt(1) 141; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 142; GFX10-NEXT: s_waitcnt vmcnt(0) 143; GFX10-NEXT: v_mov_b32_e32 v1, v4 144; GFX10-NEXT: s_setpc_b64 s[30:31] 145 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 146 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 147 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef> 148 ret <4 x half> %shuffle 149} 150 151define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 152; GFX9-LABEL: shuffle_v4f16_3uu7: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 155; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 156; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 157; GFX9-NEXT: s_waitcnt vmcnt(1) 158; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 159; GFX9-NEXT: s_waitcnt vmcnt(0) 160; GFX9-NEXT: v_mov_b32_e32 v1, v4 161; GFX9-NEXT: s_setpc_b64 s[30:31] 162; 163; GFX10-LABEL: shuffle_v4f16_3uu7: 164; GFX10: ; %bb.0: 165; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 166; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 167; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 168; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 169; GFX10-NEXT: s_waitcnt vmcnt(1) 170; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 171; GFX10-NEXT: s_waitcnt vmcnt(0) 172; GFX10-NEXT: v_mov_b32_e32 v1, v4 173; GFX10-NEXT: s_setpc_b64 s[30:31] 174 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 175 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 176 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7> 177 ret <4 x half> %shuffle 178} 179 180define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 181; 
GFX9-LABEL: shuffle_v4f16_35u5: 182; GFX9: ; %bb.0: 183; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 184; GFX9-NEXT: global_load_dword v4, v[2:3], off 185; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 186; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 187; GFX9-NEXT: s_waitcnt vmcnt(1) 188; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 189; GFX9-NEXT: s_waitcnt vmcnt(0) 190; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 191; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 192; GFX9-NEXT: v_mov_b32_e32 v1, v4 193; GFX9-NEXT: s_setpc_b64 s[30:31] 194; 195; GFX10-LABEL: shuffle_v4f16_35u5: 196; GFX10: ; %bb.0: 197; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 198; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 199; GFX10-NEXT: global_load_dword v4, v[2:3], off 200; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 201; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 202; GFX10-NEXT: s_waitcnt vmcnt(1) 203; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 204; GFX10-NEXT: s_waitcnt vmcnt(0) 205; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 206; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 207; GFX10-NEXT: v_mov_b32_e32 v1, v4 208; GFX10-NEXT: s_setpc_b64 s[30:31] 209 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 210 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 211 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5> 212 ret <4 x half> %shuffle 213} 214 215define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 216; GFX9-LABEL: shuffle_v4f16_357u: 217; GFX9: ; %bb.0: 218; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 219; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 220; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 221; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 222; GFX9-NEXT: s_waitcnt vmcnt(1) 223; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 224; 
GFX9-NEXT: s_waitcnt vmcnt(0) 225; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 226; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 227; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 228; GFX9-NEXT: s_setpc_b64 s[30:31] 229; 230; GFX10-LABEL: shuffle_v4f16_357u: 231; GFX10: ; %bb.0: 232; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 233; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 234; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 235; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 236; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 237; GFX10-NEXT: s_waitcnt vmcnt(1) 238; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 239; GFX10-NEXT: s_waitcnt vmcnt(0) 240; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 241; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 242; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 243; GFX10-NEXT: s_setpc_b64 s[30:31] 244 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 245 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 246 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef> 247 ret <4 x half> %shuffle 248} 249 250define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 251; GFX9-LABEL: shuffle_v4f16_0101: 252; GFX9: ; %bb.0: 253; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 254; GFX9-NEXT: global_load_dword v0, v[0:1], off 255; GFX9-NEXT: s_waitcnt vmcnt(0) 256; GFX9-NEXT: v_mov_b32_e32 v1, v0 257; GFX9-NEXT: s_setpc_b64 s[30:31] 258; 259; GFX10-LABEL: shuffle_v4f16_0101: 260; GFX10: ; %bb.0: 261; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 262; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 263; GFX10-NEXT: global_load_dword v0, v[0:1], off 264; GFX10-NEXT: s_waitcnt vmcnt(0) 265; GFX10-NEXT: v_mov_b32_e32 v1, v0 266; GFX10-NEXT: s_setpc_b64 s[30:31] 267 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 268 %val1 = load <4 x 
half>, <4 x half> addrspace(1)* %arg1 269 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 270 ret <4 x half> %shuffle 271} 272 273define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 274; GFX9-LABEL: shuffle_v4f16_0123: 275; GFX9: ; %bb.0: 276; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 277; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 278; GFX9-NEXT: s_waitcnt vmcnt(0) 279; GFX9-NEXT: s_setpc_b64 s[30:31] 280; 281; GFX10-LABEL: shuffle_v4f16_0123: 282; GFX10: ; %bb.0: 283; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 284; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 285; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 286; GFX10-NEXT: s_waitcnt vmcnt(0) 287; GFX10-NEXT: s_setpc_b64 s[30:31] 288 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 289 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 290 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 291 ret <4 x half> %shuffle 292} 293 294define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 295; GFX9-LABEL: shuffle_v4f16_0145: 296; GFX9: ; %bb.0: 297; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 298; GFX9-NEXT: global_load_dword v4, v[0:1], off 299; GFX9-NEXT: global_load_dword v5, v[2:3], off 300; GFX9-NEXT: s_waitcnt vmcnt(1) 301; GFX9-NEXT: v_mov_b32_e32 v0, v4 302; GFX9-NEXT: s_waitcnt vmcnt(0) 303; GFX9-NEXT: v_mov_b32_e32 v1, v5 304; GFX9-NEXT: s_setpc_b64 s[30:31] 305; 306; GFX10-LABEL: shuffle_v4f16_0145: 307; GFX10: ; %bb.0: 308; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 309; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 310; GFX10-NEXT: global_load_dword v4, v[0:1], off 311; GFX10-NEXT: global_load_dword v5, v[2:3], off 312; GFX10-NEXT: s_waitcnt vmcnt(1) 313; GFX10-NEXT: v_mov_b32_e32 v0, v4 314; GFX10-NEXT: s_waitcnt vmcnt(0) 315; GFX10-NEXT: v_mov_b32_e32 v1, v5 
316; GFX10-NEXT: s_setpc_b64 s[30:31] 317 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 318 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 319 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 320 ret <4 x half> %shuffle 321} 322 323define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 324; GFX9-LABEL: shuffle_v4f16_0167: 325; GFX9: ; %bb.0: 326; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 327; GFX9-NEXT: global_load_dword v4, v[0:1], off 328; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 329; GFX9-NEXT: s_waitcnt vmcnt(1) 330; GFX9-NEXT: v_mov_b32_e32 v0, v4 331; GFX9-NEXT: s_waitcnt vmcnt(0) 332; GFX9-NEXT: v_mov_b32_e32 v1, v5 333; GFX9-NEXT: s_setpc_b64 s[30:31] 334; 335; GFX10-LABEL: shuffle_v4f16_0167: 336; GFX10: ; %bb.0: 337; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 338; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 339; GFX10-NEXT: global_load_dword v4, v[0:1], off 340; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 341; GFX10-NEXT: s_waitcnt vmcnt(1) 342; GFX10-NEXT: v_mov_b32_e32 v0, v4 343; GFX10-NEXT: s_waitcnt vmcnt(0) 344; GFX10-NEXT: v_mov_b32_e32 v1, v5 345; GFX10-NEXT: s_setpc_b64 s[30:31] 346 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 347 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 348 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 349 ret <4 x half> %shuffle 350} 351 352define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 353; GFX9-LABEL: shuffle_v4f16_2301: 354; GFX9: ; %bb.0: 355; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 356; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 357; GFX9-NEXT: s_waitcnt vmcnt(0) 358; GFX9-NEXT: v_mov_b32_e32 v0, v2 359; GFX9-NEXT: s_setpc_b64 s[30:31] 360; 361; GFX10-LABEL: shuffle_v4f16_2301: 362; GFX10: ; %bb.0: 363; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 364; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 365; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 366; GFX10-NEXT: s_waitcnt vmcnt(0) 367; GFX10-NEXT: v_mov_b32_e32 v0, v2 368; GFX10-NEXT: s_setpc_b64 s[30:31] 369 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 370 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 371 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 372 ret <4 x half> %shuffle 373} 374 375define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 376; GFX9-LABEL: shuffle_v4f16_2323: 377; GFX9: ; %bb.0: 378; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 379; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 380; GFX9-NEXT: s_waitcnt vmcnt(0) 381; GFX9-NEXT: v_mov_b32_e32 v1, v0 382; GFX9-NEXT: s_setpc_b64 s[30:31] 383; 384; GFX10-LABEL: shuffle_v4f16_2323: 385; GFX10: ; %bb.0: 386; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 387; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 388; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 389; GFX10-NEXT: s_waitcnt vmcnt(0) 390; GFX10-NEXT: v_mov_b32_e32 v1, v0 391; GFX10-NEXT: s_setpc_b64 s[30:31] 392 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 393 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 394 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 395 ret <4 x half> %shuffle 396} 397 398define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 399; GFX9-LABEL: shuffle_v4f16_2345: 400; GFX9: ; %bb.0: 401; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 402; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 403; GFX9-NEXT: global_load_dword v5, v[2:3], off 404; GFX9-NEXT: s_waitcnt vmcnt(1) 405; GFX9-NEXT: v_mov_b32_e32 v0, v4 406; GFX9-NEXT: s_waitcnt vmcnt(0) 407; GFX9-NEXT: v_mov_b32_e32 v1, v5 408; GFX9-NEXT: s_setpc_b64 
s[30:31] 409; 410; GFX10-LABEL: shuffle_v4f16_2345: 411; GFX10: ; %bb.0: 412; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 413; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 414; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 415; GFX10-NEXT: global_load_dword v5, v[2:3], off 416; GFX10-NEXT: s_waitcnt vmcnt(1) 417; GFX10-NEXT: v_mov_b32_e32 v0, v4 418; GFX10-NEXT: s_waitcnt vmcnt(0) 419; GFX10-NEXT: v_mov_b32_e32 v1, v5 420; GFX10-NEXT: s_setpc_b64 s[30:31] 421 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 422 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 423 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 424 ret <4 x half> %shuffle 425} 426 427define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 428; GFX9-LABEL: shuffle_v4f16_2367: 429; GFX9: ; %bb.0: 430; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 431; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 432; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 433; GFX9-NEXT: s_waitcnt vmcnt(1) 434; GFX9-NEXT: v_mov_b32_e32 v0, v4 435; GFX9-NEXT: s_waitcnt vmcnt(0) 436; GFX9-NEXT: v_mov_b32_e32 v1, v5 437; GFX9-NEXT: s_setpc_b64 s[30:31] 438; 439; GFX10-LABEL: shuffle_v4f16_2367: 440; GFX10: ; %bb.0: 441; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 442; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 443; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 444; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 445; GFX10-NEXT: s_waitcnt vmcnt(1) 446; GFX10-NEXT: v_mov_b32_e32 v0, v4 447; GFX10-NEXT: s_waitcnt vmcnt(0) 448; GFX10-NEXT: v_mov_b32_e32 v1, v5 449; GFX10-NEXT: s_setpc_b64 s[30:31] 450 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 451 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 452 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 453 ret <4 x half> %shuffle 454} 455 456define <4 x 
half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 457; GFX9-LABEL: shuffle_v4f16_4501: 458; GFX9: ; %bb.0: 459; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 460; GFX9-NEXT: global_load_dword v4, v[2:3], off 461; GFX9-NEXT: global_load_dword v5, v[0:1], off 462; GFX9-NEXT: s_waitcnt vmcnt(1) 463; GFX9-NEXT: v_mov_b32_e32 v0, v4 464; GFX9-NEXT: s_waitcnt vmcnt(0) 465; GFX9-NEXT: v_mov_b32_e32 v1, v5 466; GFX9-NEXT: s_setpc_b64 s[30:31] 467; 468; GFX10-LABEL: shuffle_v4f16_4501: 469; GFX10: ; %bb.0: 470; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 471; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 472; GFX10-NEXT: global_load_dword v4, v[2:3], off 473; GFX10-NEXT: global_load_dword v5, v[0:1], off 474; GFX10-NEXT: s_waitcnt vmcnt(1) 475; GFX10-NEXT: v_mov_b32_e32 v0, v4 476; GFX10-NEXT: s_waitcnt vmcnt(0) 477; GFX10-NEXT: v_mov_b32_e32 v1, v5 478; GFX10-NEXT: s_setpc_b64 s[30:31] 479 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 480 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 481 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 482 ret <4 x half> %shuffle 483} 484 485define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 486; GFX9-LABEL: shuffle_v4f16_4523: 487; GFX9: ; %bb.0: 488; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 489; GFX9-NEXT: global_load_dword v4, v[2:3], off 490; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 491; GFX9-NEXT: s_waitcnt vmcnt(1) 492; GFX9-NEXT: v_mov_b32_e32 v0, v4 493; GFX9-NEXT: s_waitcnt vmcnt(0) 494; GFX9-NEXT: v_mov_b32_e32 v1, v5 495; GFX9-NEXT: s_setpc_b64 s[30:31] 496; 497; GFX10-LABEL: shuffle_v4f16_4523: 498; GFX10: ; %bb.0: 499; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 500; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 501; GFX10-NEXT: global_load_dword v4, v[2:3], off 502; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 503; 
GFX10-NEXT: s_waitcnt vmcnt(1) 504; GFX10-NEXT: v_mov_b32_e32 v0, v4 505; GFX10-NEXT: s_waitcnt vmcnt(0) 506; GFX10-NEXT: v_mov_b32_e32 v1, v5 507; GFX10-NEXT: s_setpc_b64 s[30:31] 508 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 509 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 510 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 511 ret <4 x half> %shuffle 512} 513 514define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 515; GFX9-LABEL: shuffle_v4f16_4545: 516; GFX9: ; %bb.0: 517; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 518; GFX9-NEXT: global_load_dword v0, v[2:3], off 519; GFX9-NEXT: s_waitcnt vmcnt(0) 520; GFX9-NEXT: v_mov_b32_e32 v1, v0 521; GFX9-NEXT: s_setpc_b64 s[30:31] 522; 523; GFX10-LABEL: shuffle_v4f16_4545: 524; GFX10: ; %bb.0: 525; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 526; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 527; GFX10-NEXT: global_load_dword v0, v[2:3], off 528; GFX10-NEXT: s_waitcnt vmcnt(0) 529; GFX10-NEXT: v_mov_b32_e32 v1, v0 530; GFX10-NEXT: s_setpc_b64 s[30:31] 531 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 532 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 533 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5> 534 ret <4 x half> %shuffle 535} 536 537define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 538; GFX9-LABEL: shuffle_v4f16_4567: 539; GFX9: ; %bb.0: 540; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 541; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off 542; GFX9-NEXT: s_waitcnt vmcnt(0) 543; GFX9-NEXT: s_setpc_b64 s[30:31] 544; 545; GFX10-LABEL: shuffle_v4f16_4567: 546; GFX10: ; %bb.0: 547; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 548; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 549; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off 550; GFX10-NEXT: 
s_waitcnt vmcnt(0) 551; GFX10-NEXT: s_setpc_b64 s[30:31] 552 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 553 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 554 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 555 ret <4 x half> %shuffle 556} 557 558define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 559; GFX9-LABEL: shuffle_v4f16_6701: 560; GFX9: ; %bb.0: 561; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 562; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 563; GFX9-NEXT: global_load_dword v5, v[0:1], off 564; GFX9-NEXT: s_waitcnt vmcnt(1) 565; GFX9-NEXT: v_mov_b32_e32 v0, v4 566; GFX9-NEXT: s_waitcnt vmcnt(0) 567; GFX9-NEXT: v_mov_b32_e32 v1, v5 568; GFX9-NEXT: s_setpc_b64 s[30:31] 569; 570; GFX10-LABEL: shuffle_v4f16_6701: 571; GFX10: ; %bb.0: 572; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 573; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 574; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 575; GFX10-NEXT: global_load_dword v5, v[0:1], off 576; GFX10-NEXT: s_waitcnt vmcnt(1) 577; GFX10-NEXT: v_mov_b32_e32 v0, v4 578; GFX10-NEXT: s_waitcnt vmcnt(0) 579; GFX10-NEXT: v_mov_b32_e32 v1, v5 580; GFX10-NEXT: s_setpc_b64 s[30:31] 581 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 582 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 583 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 584 ret <4 x half> %shuffle 585} 586 587define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 588; GFX9-LABEL: shuffle_v4f16_6723: 589; GFX9: ; %bb.0: 590; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 591; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 592; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 593; GFX9-NEXT: s_waitcnt vmcnt(1) 594; GFX9-NEXT: v_mov_b32_e32 v0, v4 595; GFX9-NEXT: s_waitcnt vmcnt(0) 596; 
GFX9-NEXT: v_mov_b32_e32 v1, v5 597; GFX9-NEXT: s_setpc_b64 s[30:31] 598; 599; GFX10-LABEL: shuffle_v4f16_6723: 600; GFX10: ; %bb.0: 601; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 602; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 603; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 604; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 605; GFX10-NEXT: s_waitcnt vmcnt(1) 606; GFX10-NEXT: v_mov_b32_e32 v0, v4 607; GFX10-NEXT: s_waitcnt vmcnt(0) 608; GFX10-NEXT: v_mov_b32_e32 v1, v5 609; GFX10-NEXT: s_setpc_b64 s[30:31] 610 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 611 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 612 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 613 ret <4 x half> %shuffle 614} 615 616define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 617; GFX9-LABEL: shuffle_v4f16_6745: 618; GFX9: ; %bb.0: 619; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 620; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 621; GFX9-NEXT: s_waitcnt vmcnt(0) 622; GFX9-NEXT: v_mov_b32_e32 v0, v2 623; GFX9-NEXT: s_setpc_b64 s[30:31] 624; 625; GFX10-LABEL: shuffle_v4f16_6745: 626; GFX10: ; %bb.0: 627; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 628; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 629; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 630; GFX10-NEXT: s_waitcnt vmcnt(0) 631; GFX10-NEXT: v_mov_b32_e32 v0, v2 632; GFX10-NEXT: s_setpc_b64 s[30:31] 633 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 634 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 635 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5> 636 ret <4 x half> %shuffle 637} 638 639define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 640; GFX9-LABEL: shuffle_v4f16_6767: 641; GFX9: ; %bb.0: 642; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
643; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 644; GFX9-NEXT: s_waitcnt vmcnt(0) 645; GFX9-NEXT: v_mov_b32_e32 v1, v0 646; GFX9-NEXT: s_setpc_b64 s[30:31] 647; 648; GFX10-LABEL: shuffle_v4f16_6767: 649; GFX10: ; %bb.0: 650; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 651; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 652; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 653; GFX10-NEXT: s_waitcnt vmcnt(0) 654; GFX10-NEXT: v_mov_b32_e32 v1, v0 655; GFX10-NEXT: s_setpc_b64 s[30:31] 656 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 657 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 658 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7> 659 ret <4 x half> %shuffle 660} 661 662define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 663; GFX9-LABEL: shuffle_v4f16_2356: 664; GFX9: ; %bb.0: 665; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 666; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 667; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 668; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 669; GFX9-NEXT: s_waitcnt vmcnt(1) 670; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 671; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 672; GFX9-NEXT: s_waitcnt vmcnt(0) 673; GFX9-NEXT: v_mov_b32_e32 v0, v4 674; GFX9-NEXT: s_setpc_b64 s[30:31] 675; 676; GFX10-LABEL: shuffle_v4f16_2356: 677; GFX10: ; %bb.0: 678; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 679; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 680; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 681; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 682; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 683; GFX10-NEXT: s_waitcnt vmcnt(1) 684; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 685; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 686; GFX10-NEXT: s_waitcnt vmcnt(0) 687; 
GFX10-NEXT: v_mov_b32_e32 v0, v4 688; GFX10-NEXT: s_setpc_b64 s[30:31] 689 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 690 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 691 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 692 ret <4 x half> %shuffle 693} 694 695define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 696; GFX9-LABEL: shuffle_v4f16_5623: 697; GFX9: ; %bb.0: 698; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 699; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 700; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 701; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 702; GFX9-NEXT: s_waitcnt vmcnt(1) 703; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 704; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 705; GFX9-NEXT: s_waitcnt vmcnt(0) 706; GFX9-NEXT: v_mov_b32_e32 v1, v4 707; GFX9-NEXT: s_setpc_b64 s[30:31] 708; 709; GFX10-LABEL: shuffle_v4f16_5623: 710; GFX10: ; %bb.0: 711; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 712; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 713; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 714; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 715; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 716; GFX10-NEXT: s_waitcnt vmcnt(1) 717; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 718; GFX10-NEXT: s_waitcnt vmcnt(0) 719; GFX10-NEXT: v_mov_b32_e32 v1, v4 720; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 721; GFX10-NEXT: s_setpc_b64 s[30:31] 722 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 723 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 724 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3> 725 ret <4 x half> %shuffle 726} 727 728define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) 
{ 729; GFX9-LABEL: shuffle_v4f16_3456: 730; GFX9: ; %bb.0: 731; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 732; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 733; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 734; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 735; GFX9-NEXT: s_waitcnt vmcnt(1) 736; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 737; GFX9-NEXT: s_waitcnt vmcnt(0) 738; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 739; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 740; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 741; GFX9-NEXT: s_setpc_b64 s[30:31] 742; 743; GFX10-LABEL: shuffle_v4f16_3456: 744; GFX10: ; %bb.0: 745; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 746; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 747; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 748; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 749; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 750; GFX10-NEXT: s_waitcnt vmcnt(1) 751; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 752; GFX10-NEXT: s_waitcnt vmcnt(0) 753; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 754; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 755; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 756; GFX10-NEXT: s_setpc_b64 s[30:31] 757 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 758 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 759 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 760 ret <4 x half> %shuffle 761} 762 763define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 764; GFX9-LABEL: shuffle_v4f16_5634: 765; GFX9: ; %bb.0: 766; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 767; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 768; GFX9-NEXT: 
global_load_dword v6, v[0:1], off offset:4 769; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 770; GFX9-NEXT: s_waitcnt vmcnt(0) 771; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 772; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 773; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 774; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 775; GFX9-NEXT: s_setpc_b64 s[30:31] 776; 777; GFX10-LABEL: shuffle_v4f16_5634: 778; GFX10: ; %bb.0: 779; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 780; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 781; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 782; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 783; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 784; GFX10-NEXT: s_waitcnt vmcnt(1) 785; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 786; GFX10-NEXT: s_waitcnt vmcnt(0) 787; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 788; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 789; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 790; GFX10-NEXT: s_setpc_b64 s[30:31] 791 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 792 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 793 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4> 794 ret <4 x half> %shuffle 795} 796 797define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 798; GFX9-LABEL: shuffle_v4f16_5734: 799; GFX9: ; %bb.0: 800; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 801; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 802; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 803; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 804; GFX9-NEXT: s_waitcnt vmcnt(1) 805; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 806; GFX9-NEXT: s_waitcnt vmcnt(0) 807; GFX9-NEXT: v_and_b32_sdwa v1, v0, 
v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 808; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 809; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 810; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 811; GFX9-NEXT: s_setpc_b64 s[30:31] 812; 813; GFX10-LABEL: shuffle_v4f16_5734: 814; GFX10: ; %bb.0: 815; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 816; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 817; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 818; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 819; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 820; GFX10-NEXT: s_waitcnt vmcnt(1) 821; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 822; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 823; GFX10-NEXT: s_waitcnt vmcnt(0) 824; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 825; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v2 826; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 827; GFX10-NEXT: s_setpc_b64 s[30:31] 828 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 829 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 830 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4> 831 ret <4 x half> %shuffle 832} 833 834define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 835; GFX9-LABEL: shuffle_v4i16_2356: 836; GFX9: ; %bb.0: 837; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 838; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 839; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 840; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 841; GFX9-NEXT: s_waitcnt vmcnt(1) 842; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 843; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 844; GFX9-NEXT: s_waitcnt vmcnt(0) 845; GFX9-NEXT: v_mov_b32_e32 v0, v4 846; 
GFX9-NEXT: s_setpc_b64 s[30:31] 847; 848; GFX10-LABEL: shuffle_v4i16_2356: 849; GFX10: ; %bb.0: 850; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 851; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 852; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 853; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 854; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 855; GFX10-NEXT: s_waitcnt vmcnt(1) 856; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 857; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 858; GFX10-NEXT: s_waitcnt vmcnt(0) 859; GFX10-NEXT: v_mov_b32_e32 v0, v4 860; GFX10-NEXT: s_setpc_b64 s[30:31] 861 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 862 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 863 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 864 ret <4 x i16> %shuffle 865} 866 867define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 868; GFX9-LABEL: shuffle_v4i16_0167: 869; GFX9: ; %bb.0: 870; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 871; GFX9-NEXT: global_load_dword v4, v[0:1], off 872; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 873; GFX9-NEXT: s_waitcnt vmcnt(1) 874; GFX9-NEXT: v_mov_b32_e32 v0, v4 875; GFX9-NEXT: s_waitcnt vmcnt(0) 876; GFX9-NEXT: v_mov_b32_e32 v1, v5 877; GFX9-NEXT: s_setpc_b64 s[30:31] 878; 879; GFX10-LABEL: shuffle_v4i16_0167: 880; GFX10: ; %bb.0: 881; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 882; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 883; GFX10-NEXT: global_load_dword v4, v[0:1], off 884; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 885; GFX10-NEXT: s_waitcnt vmcnt(1) 886; GFX10-NEXT: v_mov_b32_e32 v0, v4 887; GFX10-NEXT: s_waitcnt vmcnt(0) 888; GFX10-NEXT: v_mov_b32_e32 v1, v5 889; GFX10-NEXT: s_setpc_b64 s[30:31] 890 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 891 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 
892 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 893 ret <4 x i16> %shuffle 894} 895 896define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 897; GFX9-LABEL: shuffle_v4f16_0000: 898; GFX9: ; %bb.0: 899; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 900; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 901; GFX9-NEXT: s_waitcnt vmcnt(0) 902; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 903; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 904; GFX9-NEXT: v_mov_b32_e32 v1, v0 905; GFX9-NEXT: s_setpc_b64 s[30:31] 906; 907; GFX10-LABEL: shuffle_v4f16_0000: 908; GFX10: ; %bb.0: 909; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 910; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 911; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 912; GFX10-NEXT: s_waitcnt vmcnt(0) 913; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0 914; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 915; GFX10-NEXT: v_mov_b32_e32 v1, v0 916; GFX10-NEXT: s_setpc_b64 s[30:31] 917 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 918 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 919 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer 920 ret <4 x half> %shuffle 921} 922 923define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 924; GFX9-LABEL: shuffle_v4f16_1010: 925; GFX9: ; %bb.0: 926; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 927; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 928; GFX9-NEXT: s_waitcnt vmcnt(0) 929; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 930; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 931; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 932; GFX9-NEXT: v_mov_b32_e32 v1, v0 933; GFX9-NEXT: s_setpc_b64 s[30:31] 934; 935; GFX10-LABEL: shuffle_v4f16_1010: 936; GFX10: ; %bb.0: 937; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 938; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 939; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 940; GFX10-NEXT: s_waitcnt vmcnt(0) 941; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 942; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 943; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 944; GFX10-NEXT: v_mov_b32_e32 v1, v0 945; GFX10-NEXT: s_setpc_b64 s[30:31] 946 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 947 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 948 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0> 949 ret <4 x half> %shuffle 950} 951 952define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 953; GFX9-LABEL: shuffle_v4f16_1100: 954; GFX9: ; %bb.0: 955; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 956; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 957; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 958; GFX9-NEXT: s_waitcnt vmcnt(0) 959; GFX9-NEXT: v_and_b32_e32 v1, v2, v0 960; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 961; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 962; GFX9-NEXT: v_and_b32_e32 v0, v2, v3 963; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 964; GFX9-NEXT: s_setpc_b64 s[30:31] 965; 966; GFX10-LABEL: shuffle_v4f16_1100: 967; GFX10: ; %bb.0: 968; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 969; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 970; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 971; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 972; GFX10-NEXT: s_waitcnt vmcnt(0) 973; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 974; GFX10-NEXT: v_and_b32_e32 v4, v0, v1 975; GFX10-NEXT: v_and_b32_e32 v3, v0, v2 976; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v4 977; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v3 978; GFX10-NEXT: s_setpc_b64 s[30:31] 979 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 980 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 981 %shuffle = shufflevector <4 x half> %val0, 
<4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0> 982 ret <4 x half> %shuffle 983} 984 985define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 986; GFX9-LABEL: shuffle_v4f16_6161: 987; GFX9: ; %bb.0: 988; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 989; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 990; GFX9-NEXT: global_load_dword v5, v[0:1], off 991; GFX9-NEXT: s_waitcnt vmcnt(1) 992; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 993; GFX9-NEXT: s_waitcnt vmcnt(0) 994; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 995; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 996; GFX9-NEXT: v_mov_b32_e32 v1, v0 997; GFX9-NEXT: s_setpc_b64 s[30:31] 998; 999; GFX10-LABEL: shuffle_v4f16_6161: 1000; GFX10: ; %bb.0: 1001; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1002; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1003; GFX10-NEXT: global_load_dword v4, v[0:1], off 1004; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 1005; GFX10-NEXT: s_waitcnt vmcnt(1) 1006; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 1007; GFX10-NEXT: s_waitcnt vmcnt(0) 1008; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 1009; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 1010; GFX10-NEXT: v_mov_b32_e32 v1, v0 1011; GFX10-NEXT: s_setpc_b64 s[30:31] 1012 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1013 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1014 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1> 1015 ret <4 x half> %shuffle 1016} 1017 1018define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1019; GFX9-LABEL: shuffle_v4f16_2333: 1020; GFX9: ; %bb.0: 1021; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1022; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 1023; GFX9-NEXT: s_waitcnt vmcnt(0) 1024; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1025; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1026; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 
1027; GFX9-NEXT: s_setpc_b64 s[30:31] 1028; 1029; GFX10-LABEL: shuffle_v4f16_2333: 1030; GFX10: ; %bb.0: 1031; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1032; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1033; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 1034; GFX10-NEXT: s_waitcnt vmcnt(0) 1035; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1036; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1037; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1038; GFX10-NEXT: s_setpc_b64 s[30:31] 1039 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1040 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1041 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 1042 ret <4 x half> %shuffle 1043} 1044
; NOTE(review): the mask used by @shuffle_v4f16_6667 below is <2, 3, 3, 3>, the same mask as @shuffle_v4f16_2333, and both functions expect identical assembly (a single load through v[0:1] at offset 4). The name suggests <6, 6, 6, 7>, i.e. elements taken from %val1, which this body never selects. Confirm the intent; if the mask is changed, regenerate the assertions with utils/update_llc_test_checks.py rather than editing them by hand.
1045define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1046; GFX9-LABEL: shuffle_v4f16_6667: 1047; GFX9: ; %bb.0: 1048; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1049; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 1050; GFX9-NEXT: s_waitcnt vmcnt(0) 1051; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1052; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1053; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1054; GFX9-NEXT: s_setpc_b64 s[30:31] 1055; 1056; GFX10-LABEL: shuffle_v4f16_6667: 1057; GFX10: ; %bb.0: 1058; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1059; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1060; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 1061; GFX10-NEXT: s_waitcnt vmcnt(0) 1062; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1063; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1064; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1065; GFX10-NEXT: s_setpc_b64 s[30:31] 1066 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1067 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1068 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 1069 ret <4 x half> %shuffle 1070} 1071 1072define <4 x half>
@shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1073; GFX9-LABEL: shuffle_v8f16_0101: 1074; GFX9: ; %bb.0: 1075; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1076; GFX9-NEXT: global_load_dword v0, v[0:1], off 1077; GFX9-NEXT: s_waitcnt vmcnt(0) 1078; GFX9-NEXT: v_mov_b32_e32 v1, v0 1079; GFX9-NEXT: s_setpc_b64 s[30:31] 1080; 1081; GFX10-LABEL: shuffle_v8f16_0101: 1082; GFX10: ; %bb.0: 1083; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1084; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1085; GFX10-NEXT: global_load_dword v0, v[0:1], off 1086; GFX10-NEXT: s_waitcnt vmcnt(0) 1087; GFX10-NEXT: v_mov_b32_e32 v1, v0 1088; GFX10-NEXT: s_setpc_b64 s[30:31] 1089 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1090 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1091 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1092 ret <4 x half> %shuffle 1093} 1094 1095define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1096; GFX9-LABEL: shuffle_v8f16_0123: 1097; GFX9: ; %bb.0: 1098; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1099; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1100; GFX9-NEXT: s_waitcnt vmcnt(0) 1101; GFX9-NEXT: s_setpc_b64 s[30:31] 1102; 1103; GFX10-LABEL: shuffle_v8f16_0123: 1104; GFX10: ; %bb.0: 1105; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1106; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1107; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1108; GFX10-NEXT: s_waitcnt vmcnt(0) 1109; GFX10-NEXT: s_setpc_b64 s[30:31] 1110 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1111 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1112 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1113 ret <4 x half> %shuffle 1114} 1115 1116define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) 
{ 1117; GFX9-LABEL: shuffle_v8f16_4589: 1118; GFX9: ; %bb.0: 1119; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1120; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 1121; GFX9-NEXT: global_load_dword v5, v[2:3], off 1122; GFX9-NEXT: s_waitcnt vmcnt(1) 1123; GFX9-NEXT: v_mov_b32_e32 v0, v4 1124; GFX9-NEXT: s_waitcnt vmcnt(0) 1125; GFX9-NEXT: v_mov_b32_e32 v1, v5 1126; GFX9-NEXT: s_setpc_b64 s[30:31] 1127; 1128; GFX10-LABEL: shuffle_v8f16_4589: 1129; GFX10: ; %bb.0: 1130; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1131; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1132; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 1133; GFX10-NEXT: global_load_dword v5, v[2:3], off 1134; GFX10-NEXT: s_waitcnt vmcnt(1) 1135; GFX10-NEXT: v_mov_b32_e32 v0, v4 1136; GFX10-NEXT: s_waitcnt vmcnt(0) 1137; GFX10-NEXT: v_mov_b32_e32 v1, v5 1138; GFX10-NEXT: s_setpc_b64 s[30:31] 1139 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1140 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1141 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9> 1142 ret <4 x half> %shuffle 1143} 1144 1145define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1146; GFX9-LABEL: shuffle_v8f16_10_11_2_3: 1147; GFX9: ; %bb.0: 1148; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1149; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 1150; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 1151; GFX9-NEXT: s_waitcnt vmcnt(1) 1152; GFX9-NEXT: v_mov_b32_e32 v0, v4 1153; GFX9-NEXT: s_waitcnt vmcnt(0) 1154; GFX9-NEXT: v_mov_b32_e32 v1, v5 1155; GFX9-NEXT: s_setpc_b64 s[30:31] 1156; 1157; GFX10-LABEL: shuffle_v8f16_10_11_2_3: 1158; GFX10: ; %bb.0: 1159; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1160; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1161; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 1162; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 1163; 
GFX10-NEXT: s_waitcnt vmcnt(1) 1164; GFX10-NEXT: v_mov_b32_e32 v0, v4 1165; GFX10-NEXT: s_waitcnt vmcnt(0) 1166; GFX10-NEXT: v_mov_b32_e32 v1, v5 1167; GFX10-NEXT: s_setpc_b64 s[30:31] 1168 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1169 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1170 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3> 1171 ret <4 x half> %shuffle 1172} 1173 1174define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 1175; GFX9-LABEL: shuffle_v8f16_13_14_2_3: 1176; GFX9: ; %bb.0: 1177; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1178; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 1179; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 1180; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1181; GFX9-NEXT: s_waitcnt vmcnt(1) 1182; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1183; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 1184; GFX9-NEXT: s_waitcnt vmcnt(0) 1185; GFX9-NEXT: v_mov_b32_e32 v1, v4 1186; GFX9-NEXT: s_setpc_b64 s[30:31] 1187; 1188; GFX10-LABEL: shuffle_v8f16_13_14_2_3: 1189; GFX10: ; %bb.0: 1190; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1191; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1192; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 1193; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 1194; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1195; GFX10-NEXT: s_waitcnt vmcnt(1) 1196; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1197; GFX10-NEXT: s_waitcnt vmcnt(0) 1198; GFX10-NEXT: v_mov_b32_e32 v1, v4 1199; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 1200; GFX10-NEXT: s_setpc_b64 s[30:31] 1201 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 1202 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 1203 %shuffle = shufflevector <8 x half> %val0, <8 x half> 
%val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3> 1204 ret <4 x half> %shuffle 1205} 1206 1207define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) { 1208; GFX9-LABEL: shuffle_v3f16_0122: 1209; GFX9: ; %bb.0: 1210; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1211; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1212; GFX9-NEXT: s_waitcnt vmcnt(0) 1213; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 1214; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1215; GFX9-NEXT: s_setpc_b64 s[30:31] 1216; 1217; GFX10-LABEL: shuffle_v3f16_0122: 1218; GFX10: ; %bb.0: 1219; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1220; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1221; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1222; GFX10-NEXT: s_waitcnt vmcnt(0) 1223; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 1224; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1225; GFX10-NEXT: s_setpc_b64 s[30:31] 1226 %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 1227 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 1228 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 1229 ret <4 x half> %shuffle 1230} 1231
; NOTE(review): the mask used by @shuffle_v2f16_0122 below is <0, 1, 1, 0>, not the <0, 1, 2, 2> its name suggests (compare @shuffle_v3f16_0122 above, whose mask really is <0, 1, 2, 2>). With <2 x half> operands, index 2 would select the first element of %val1, which this body never selects; the checked assembly issues a single load through v[0:1]. Confirm the intent; if the mask is changed, regenerate the assertions with utils/update_llc_test_checks.py rather than editing them by hand.
1232define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) { 1233; GFX9-LABEL: shuffle_v2f16_0122: 1234; GFX9: ; %bb.0: 1235; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1236; GFX9-NEXT: global_load_dword v0, v[0:1], off 1237; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 1238; GFX9-NEXT: s_waitcnt vmcnt(0) 1239; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1240; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1241; GFX9-NEXT: s_setpc_b64 s[30:31] 1242; 1243; GFX10-LABEL: shuffle_v2f16_0122: 1244; GFX10: ; %bb.0: 1245; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1246; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1247; GFX10-NEXT: global_load_dword v0, v[0:1], off 1248; GFX10-NEXT: v_mov_b32_e32
v1, 0xffff 1249; GFX10-NEXT: s_waitcnt vmcnt(0) 1250; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1251; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 1252; GFX10-NEXT: s_setpc_b64 s[30:31] 1253 %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 1254 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 1255 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0> 1256 ret <4 x half> %shuffle 1257} 1258 1259define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) { 1260; GFX9-LABEL: shuffle_v6f16_452367: 1261; GFX9: ; %bb.0: 1262; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1263; GFX9-NEXT: v_mov_b32_e32 v6, v1 1264; GFX9-NEXT: v_mov_b32_e32 v5, v0 1265; GFX9-NEXT: v_mov_b32_e32 v4, v3 1266; GFX9-NEXT: v_mov_b32_e32 v3, v2 1267; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1268; GFX9-NEXT: global_load_dword v7, v[3:4], off 1269; GFX9-NEXT: s_waitcnt vmcnt(1) 1270; GFX9-NEXT: v_mov_b32_e32 v0, v2 1271; GFX9-NEXT: s_waitcnt vmcnt(0) 1272; GFX9-NEXT: v_mov_b32_e32 v2, v7 1273; GFX9-NEXT: s_setpc_b64 s[30:31] 1274; 1275; GFX10-LABEL: shuffle_v6f16_452367: 1276; GFX10: ; %bb.0: 1277; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1278; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1279; GFX10-NEXT: v_mov_b32_e32 v6, v1 1280; GFX10-NEXT: v_mov_b32_e32 v5, v0 1281; GFX10-NEXT: v_mov_b32_e32 v4, v3 1282; GFX10-NEXT: v_mov_b32_e32 v3, v2 1283; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1284; GFX10-NEXT: global_load_dword v7, v[3:4], off 1285; GFX10-NEXT: s_waitcnt vmcnt(1) 1286; GFX10-NEXT: v_mov_b32_e32 v0, v2 1287; GFX10-NEXT: s_waitcnt vmcnt(0) 1288; GFX10-NEXT: v_mov_b32_e32 v2, v7 1289; GFX10-NEXT: s_setpc_b64 s[30:31] 1290 %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 1291 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 1292 %shuffle = shufflevector <6 x half> %val0, <6 x half>
%val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7> 1293 ret <6 x half> %shuffle 1294} 1295 1296define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) { 1297; GFX9-LABEL: fma_shuffle: 1298; GFX9: ; %bb.0: ; %entry 1299; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1300; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1301; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1302; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1303; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1304; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1305; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1306; GFX9-NEXT: s_waitcnt vmcnt(0) 1307; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1308; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1309; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1310; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1311; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 1312; GFX9-NEXT: s_endpgm 1313; 1314; GFX10-LABEL: fma_shuffle: 1315; GFX10: ; %bb.0: ; %entry 1316; GFX10-NEXT: s_clause 0x1 1317; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1318; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1319; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1320; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1321; GFX10-NEXT: s_clause 0x2 1322; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1323; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1324; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1325; GFX10-NEXT: s_waitcnt vmcnt(0) 1326; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1327; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1328; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1329; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1330; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 1331; GFX10-NEXT: s_endpgm 1332entry: 1333 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() 1334 
%tmp12 = zext i32 %tmp1 to i64 1335 %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12 1336 %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8 1337 %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12 1338 %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8 1339 %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12 1340 %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8 1341 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer 1342 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1343 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1344 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19) 1345 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1> 1346 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1347 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20) 1348 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1349 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 1350 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2> 1351 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1352 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27) 1353 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3> 1354 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28) 1355 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 
undef, i32 undef> 1356 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1357 store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8 1358 ret void 1359} 1360 1361define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 1362; GFX9-LABEL: shuffle_v4f16_0456: 1363; GFX9: ; %bb.0: 1364; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1365; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1366; GFX9-NEXT: s_waitcnt vmcnt(0) 1367; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1368; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1369; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 1370; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1371; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 1372; GFX9-NEXT: s_waitcnt vmcnt(0) 1373; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1374; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1375; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 1376; GFX9-NEXT: s_setpc_b64 s[30:31] 1377; 1378; GFX10-LABEL: shuffle_v4f16_0456: 1379; GFX10: ; %bb.0: 1380; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1381; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1382; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1383; GFX10-NEXT: s_waitcnt vmcnt(0) 1384; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1385; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1386; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff 1387; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1388; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 1389; GFX10-NEXT: s_waitcnt vmcnt(0) 1390; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1391; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 1392; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 1393; GFX10-NEXT: s_setpc_b64 s[30:31] 1394 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 1395 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 1396 %shuffle = 
shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1397 ret <4 x half> %shuffle 1398} 1399 1400define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) { 1401; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: 1402; GFX9: ; %bb.0: 1403; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1404; GFX9-NEXT: v_mov_b32_e32 v4, 0 1405; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1406; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1408; GFX9-NEXT: v_mov_b32_e32 v0, s4 1409; GFX9-NEXT: v_mov_b32_e32 v1, s5 1410; GFX9-NEXT: v_mov_b32_e32 v2, s6 1411; GFX9-NEXT: v_mov_b32_e32 v3, s7 1412; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1413; GFX9-NEXT: s_endpgm 1414; 1415; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: 1416; GFX10: ; %bb.0: 1417; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1418; GFX10-NEXT: v_mov_b32_e32 v4, 0 1419; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1420; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1421; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX10-NEXT: v_mov_b32_e32 v0, s4 1423; GFX10-NEXT: v_mov_b32_e32 v1, s5 1424; GFX10-NEXT: v_mov_b32_e32 v2, s6 1425; GFX10-NEXT: v_mov_b32_e32 v3, s7 1426; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1427; GFX10-NEXT: s_endpgm 1428 %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 1429 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1430 store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8 1431 ret void 1432} 1433 1434declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 1435declare i32 @llvm.amdgcn.workitem.id.x() #0 1436 1437attributes #0 = { nounwind readnone speculatable } 1438