; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s

; Each function loads two vectors (%val0 from %arg0, %val1 from %arg1) and
; returns one shufflevector of them. The function name suffix spells out the
; shuffle mask: a digit is a lane index into the concatenation of %val0 and
; %val1, and 'u' is an undef lane (e.g. _23uu is <2, 3, undef, undef>).
; The assertion comments are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; ---- <4 x half> shuffles with undef lanes ----

define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_23uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_234u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u1u3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3u1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3u6u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3uu7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_35u5:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_357u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
  ret <4 x half> %shuffle
}

; ---- <4 x half> shuffles that move whole 32-bit pairs ----

define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0145:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2301:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2323:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2345:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4501:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4523:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4545:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4567:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6745:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6767:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; ---- <4 x half> shuffles that straddle 32-bit pairs ----

define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2356:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5623:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3456:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5634:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5734:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
  ret <4 x half> %shuffle
}

; ---- <4 x i16> variants ----

define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4i16_2356:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x i16> %shuffle
}

define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4i16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %shuffle
}

; ---- <4 x half> shuffles that repeat or swap single lanes ----

define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0000:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_1010:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_1100:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6161:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2333:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
  ret <4 x half> %shuffle
}

; FIXME(review): the name says mask <6, 6, 6, 7>, but the mask below is
; <2, 3, 3, 3>, the same as shuffle_v4f16_2333 above, so this is a duplicate
; test and the 6667 case is not covered. If the mask is corrected, the
; assertions must be regenerated with utils/update_llc_test_checks.py.
define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6667:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
  ret <4 x half> %shuffle
}

; ---- Shuffles whose source width differs from the <4 x half> result ----

define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_4589:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
  ret <4 x half> %shuffle
}

define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v3f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x half> %shuffle
}

; NOTE(review): the name says mask <0, 1, 2, 2>, but the mask below is
; <0, 1, 1, 0>. The assertions match the mask as written; confirm which one
; was intended before renaming the function or changing the mask.
define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v2f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  ret <4 x half> %shuffle
}

define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v6f16_452367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v3, v[2:3], off
; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    v_mov_b32_e32 v2, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
  ret <6 x half> %shuffle
}

747define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) { 748; GFX9-LABEL: fma_shuffle: 749; GFX9: ; %bb.0: ; %entry 750; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 751; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 752; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 753; GFX9-NEXT: s_waitcnt lgkmcnt(0) 754; GFX9-NEXT: v_mov_b32_e32 v1, s1 755; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 756; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 757; GFX9-NEXT: v_mov_b32_e32 v3, s3 758; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 759; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 760; GFX9-NEXT: v_mov_b32_e32 v5, s5 761; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 762; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 763; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off 764; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off 765; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 766; GFX9-NEXT: s_waitcnt vmcnt(0) 767; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1] 768; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1] 769; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0] 770; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 771; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 772; GFX9-NEXT: s_endpgm 773entry: 774 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() 775 %tmp12 = zext i32 %tmp1 to i64 776 %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12 777 %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8 778 %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12 779 %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8 780 %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12 781 %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8 782 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer 783 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1> 784 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1> 785 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19) 786 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1> 787 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 
2, i32 3> 788 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20) 789 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 790 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 791 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2> 792 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3> 793 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27) 794 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3> 795 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28) 796 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 797 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 798 store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8 799 ret void 800} 801 802define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 803; GFX9-LABEL: shuffle_v4f16_0456: 804; GFX9: ; %bb.0: 805; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 806; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off 807; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 808; GFX9-NEXT: s_waitcnt vmcnt(0) 809; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 810; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 811; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 812; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 813; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 814; GFX9-NEXT: s_setpc_b64 s[30:31] 815 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 816 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 817 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x 
i32> <i32 0, i32 4, i32 5, i32 6> 818 ret <4 x half> %shuffle 819} 820 821declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 822declare i32 @llvm.amdgcn.workitem.id.x() #0 823 824attributes #0 = { nounwind readnone speculatable } 825