1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 3 4define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 5; GFX9-LABEL: shuffle_v4f16_23uu: 6; GFX9: ; %bb.0: 7; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 9; GFX9-NEXT: s_waitcnt vmcnt(0) 10; GFX9-NEXT: s_setpc_b64 s[30:31] 11 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 12 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 13 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 14 ret <4 x half> %shuffle 15} 16 17define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 18; GFX9-LABEL: shuffle_v4f16_234u: 19; GFX9: ; %bb.0: 20; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 22; GFX9-NEXT: s_waitcnt vmcnt(0) 23; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 24; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 25; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 26; GFX9-NEXT: v_mov_b32_e32 v1, v4 27; GFX9-NEXT: s_waitcnt vmcnt(0) 28; GFX9-NEXT: v_mov_b32_e32 v0, v5 29; GFX9-NEXT: s_setpc_b64 s[30:31] 30 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 31 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 32 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef> 33 ret <4 x half> %shuffle 34} 35 36define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 37; GFX9-LABEL: shuffle_v4f16_u1u3: 38; GFX9: ; %bb.0: 39; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 41; GFX9-NEXT: s_waitcnt vmcnt(0) 42; GFX9-NEXT: s_setpc_b64 
s[30:31] 43 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 44 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 45 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3> 46 ret <4 x half> %shuffle 47} 48 49define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 50; GFX9-LABEL: shuffle_v4f16_u3u1: 51; GFX9: ; %bb.0: 52; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 53; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 54; GFX9-NEXT: s_waitcnt vmcnt(0) 55; GFX9-NEXT: v_mov_b32_e32 v0, v2 56; GFX9-NEXT: s_setpc_b64 s[30:31] 57 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 58 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 59 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1> 60 ret <4 x half> %shuffle 61} 62 63define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 64; GFX9-LABEL: shuffle_v4f16_u3uu: 65; GFX9: ; %bb.0: 66; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 67; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 68; GFX9-NEXT: s_waitcnt vmcnt(0) 69; GFX9-NEXT: s_setpc_b64 s[30:31] 70 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 71 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 72 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef> 73 ret <4 x half> %shuffle 74} 75 76define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 77; GFX9-LABEL: shuffle_v4f16_3u6u: 78; GFX9: ; %bb.0: 79; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 80; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 81; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 82; GFX9-NEXT: s_waitcnt vmcnt(1) 83; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 84; GFX9-NEXT: s_waitcnt vmcnt(0) 85; GFX9-NEXT: v_mov_b32_e32 v1, 
v4 86; GFX9-NEXT: s_setpc_b64 s[30:31] 87 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 88 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 89 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef> 90 ret <4 x half> %shuffle 91} 92 93define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 94; GFX9-LABEL: shuffle_v4f16_3uu7: 95; GFX9: ; %bb.0: 96; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 97; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 98; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 99; GFX9-NEXT: s_waitcnt vmcnt(1) 100; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 101; GFX9-NEXT: s_waitcnt vmcnt(0) 102; GFX9-NEXT: v_mov_b32_e32 v1, v4 103; GFX9-NEXT: s_setpc_b64 s[30:31] 104 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 105 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 106 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7> 107 ret <4 x half> %shuffle 108} 109 110define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 111; GFX9-LABEL: shuffle_v4f16_35u5: 112; GFX9: ; %bb.0: 113; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 114; GFX9-NEXT: global_load_dword v4, v[2:3], off 115; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 116; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 117; GFX9-NEXT: s_waitcnt vmcnt(1) 118; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 119; GFX9-NEXT: s_waitcnt vmcnt(0) 120; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 121; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 122; GFX9-NEXT: v_mov_b32_e32 v1, v4 123; GFX9-NEXT: s_setpc_b64 s[30:31] 124 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 125 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 126 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> 
<i32 3, i32 5, i32 undef, i32 5> 127 ret <4 x half> %shuffle 128} 129 130define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 131; GFX9-LABEL: shuffle_v4f16_357u: 132; GFX9: ; %bb.0: 133; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 134; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 135; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 136; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 137; GFX9-NEXT: s_waitcnt vmcnt(1) 138; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 139; GFX9-NEXT: s_waitcnt vmcnt(0) 140; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 141; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 142; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 143; GFX9-NEXT: s_setpc_b64 s[30:31] 144 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 145 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 146 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef> 147 ret <4 x half> %shuffle 148} 149 150define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 151; GFX9-LABEL: shuffle_v4f16_0101: 152; GFX9: ; %bb.0: 153; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 154; GFX9-NEXT: global_load_dword v0, v[0:1], off 155; GFX9-NEXT: s_waitcnt vmcnt(0) 156; GFX9-NEXT: v_mov_b32_e32 v1, v0 157; GFX9-NEXT: s_setpc_b64 s[30:31] 158 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 159 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 160 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 161 ret <4 x half> %shuffle 162} 163 164define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 165; GFX9-LABEL: shuffle_v4f16_0123: 166; GFX9: ; %bb.0: 167; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 168; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 169; GFX9-NEXT: 
s_waitcnt vmcnt(0) 170; GFX9-NEXT: s_setpc_b64 s[30:31] 171 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 172 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 173 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 174 ret <4 x half> %shuffle 175} 176 177define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 178; GFX9-LABEL: shuffle_v4f16_0145: 179; GFX9: ; %bb.0: 180; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 181; GFX9-NEXT: global_load_dword v4, v[0:1], off 182; GFX9-NEXT: global_load_dword v5, v[2:3], off 183; GFX9-NEXT: s_waitcnt vmcnt(1) 184; GFX9-NEXT: v_mov_b32_e32 v0, v4 185; GFX9-NEXT: s_waitcnt vmcnt(0) 186; GFX9-NEXT: v_mov_b32_e32 v1, v5 187; GFX9-NEXT: s_setpc_b64 s[30:31] 188 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 189 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 190 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 191 ret <4 x half> %shuffle 192} 193 194define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 195; GFX9-LABEL: shuffle_v4f16_0167: 196; GFX9: ; %bb.0: 197; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 198; GFX9-NEXT: global_load_dword v4, v[0:1], off 199; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 200; GFX9-NEXT: s_waitcnt vmcnt(1) 201; GFX9-NEXT: v_mov_b32_e32 v0, v4 202; GFX9-NEXT: s_waitcnt vmcnt(0) 203; GFX9-NEXT: v_mov_b32_e32 v1, v5 204; GFX9-NEXT: s_setpc_b64 s[30:31] 205 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 206 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 207 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 208 ret <4 x half> %shuffle 209} 210 211define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 212; GFX9-LABEL: shuffle_v4f16_2301: 213; 
GFX9: ; %bb.0: 214; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 215; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 216; GFX9-NEXT: s_waitcnt vmcnt(0) 217; GFX9-NEXT: v_mov_b32_e32 v0, v2 218; GFX9-NEXT: s_setpc_b64 s[30:31] 219 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 220 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 221 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 222 ret <4 x half> %shuffle 223} 224 225define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 226; GFX9-LABEL: shuffle_v4f16_2323: 227; GFX9: ; %bb.0: 228; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 229; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 230; GFX9-NEXT: s_waitcnt vmcnt(0) 231; GFX9-NEXT: v_mov_b32_e32 v1, v0 232; GFX9-NEXT: s_setpc_b64 s[30:31] 233 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 234 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 235 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 236 ret <4 x half> %shuffle 237} 238 239define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 240; GFX9-LABEL: shuffle_v4f16_2345: 241; GFX9: ; %bb.0: 242; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 243; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 244; GFX9-NEXT: global_load_dword v5, v[2:3], off 245; GFX9-NEXT: s_waitcnt vmcnt(1) 246; GFX9-NEXT: v_mov_b32_e32 v0, v4 247; GFX9-NEXT: s_waitcnt vmcnt(0) 248; GFX9-NEXT: v_mov_b32_e32 v1, v5 249; GFX9-NEXT: s_setpc_b64 s[30:31] 250 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 251 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 252 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 253 ret <4 x half> %shuffle 254} 255 256define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x 
half> addrspace(1)* %arg1) { 257; GFX9-LABEL: shuffle_v4f16_2367: 258; GFX9: ; %bb.0: 259; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 260; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 261; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 262; GFX9-NEXT: s_waitcnt vmcnt(1) 263; GFX9-NEXT: v_mov_b32_e32 v0, v4 264; GFX9-NEXT: s_waitcnt vmcnt(0) 265; GFX9-NEXT: v_mov_b32_e32 v1, v5 266; GFX9-NEXT: s_setpc_b64 s[30:31] 267 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 268 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 269 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 270 ret <4 x half> %shuffle 271} 272 273define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 274; GFX9-LABEL: shuffle_v4f16_4501: 275; GFX9: ; %bb.0: 276; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 277; GFX9-NEXT: global_load_dword v4, v[2:3], off 278; GFX9-NEXT: global_load_dword v5, v[0:1], off 279; GFX9-NEXT: s_waitcnt vmcnt(1) 280; GFX9-NEXT: v_mov_b32_e32 v0, v4 281; GFX9-NEXT: s_waitcnt vmcnt(0) 282; GFX9-NEXT: v_mov_b32_e32 v1, v5 283; GFX9-NEXT: s_setpc_b64 s[30:31] 284 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 285 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 286 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 287 ret <4 x half> %shuffle 288} 289 290define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 291; GFX9-LABEL: shuffle_v4f16_4523: 292; GFX9: ; %bb.0: 293; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 294; GFX9-NEXT: global_load_dword v4, v[2:3], off 295; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 296; GFX9-NEXT: s_waitcnt vmcnt(1) 297; GFX9-NEXT: v_mov_b32_e32 v0, v4 298; GFX9-NEXT: s_waitcnt vmcnt(0) 299; GFX9-NEXT: v_mov_b32_e32 v1, v5 300; GFX9-NEXT: s_setpc_b64 s[30:31] 301 %val0 = load <4 x 
half>, <4 x half> addrspace(1)* %arg0 302 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 303 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 304 ret <4 x half> %shuffle 305} 306 307define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 308; GFX9-LABEL: shuffle_v4f16_4545: 309; GFX9: ; %bb.0: 310; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 311; GFX9-NEXT: global_load_dword v0, v[2:3], off 312; GFX9-NEXT: s_waitcnt vmcnt(0) 313; GFX9-NEXT: v_mov_b32_e32 v1, v0 314; GFX9-NEXT: s_setpc_b64 s[30:31] 315 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 316 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 317 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5> 318 ret <4 x half> %shuffle 319} 320 321define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 322; GFX9-LABEL: shuffle_v4f16_4567: 323; GFX9: ; %bb.0: 324; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 325; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off 326; GFX9-NEXT: s_waitcnt vmcnt(0) 327; GFX9-NEXT: s_setpc_b64 s[30:31] 328 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 329 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 330 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 331 ret <4 x half> %shuffle 332} 333 334define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 335; GFX9-LABEL: shuffle_v4f16_6701: 336; GFX9: ; %bb.0: 337; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 338; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 339; GFX9-NEXT: global_load_dword v5, v[0:1], off 340; GFX9-NEXT: s_waitcnt vmcnt(1) 341; GFX9-NEXT: v_mov_b32_e32 v0, v4 342; GFX9-NEXT: s_waitcnt vmcnt(0) 343; GFX9-NEXT: v_mov_b32_e32 v1, v5 344; GFX9-NEXT: s_setpc_b64 s[30:31] 
345 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 346 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 347 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 348 ret <4 x half> %shuffle 349} 350 351define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 352; GFX9-LABEL: shuffle_v4f16_6723: 353; GFX9: ; %bb.0: 354; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 355; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 356; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 357; GFX9-NEXT: s_waitcnt vmcnt(1) 358; GFX9-NEXT: v_mov_b32_e32 v0, v4 359; GFX9-NEXT: s_waitcnt vmcnt(0) 360; GFX9-NEXT: v_mov_b32_e32 v1, v5 361; GFX9-NEXT: s_setpc_b64 s[30:31] 362 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 363 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 364 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 365 ret <4 x half> %shuffle 366} 367 368define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 369; GFX9-LABEL: shuffle_v4f16_6745: 370; GFX9: ; %bb.0: 371; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 372; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 373; GFX9-NEXT: s_waitcnt vmcnt(0) 374; GFX9-NEXT: v_mov_b32_e32 v0, v2 375; GFX9-NEXT: s_setpc_b64 s[30:31] 376 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 377 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 378 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5> 379 ret <4 x half> %shuffle 380} 381 382define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 383; GFX9-LABEL: shuffle_v4f16_6767: 384; GFX9: ; %bb.0: 385; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 386; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 387; GFX9-NEXT: s_waitcnt 
vmcnt(0) 388; GFX9-NEXT: v_mov_b32_e32 v1, v0 389; GFX9-NEXT: s_setpc_b64 s[30:31] 390 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 391 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 392 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7> 393 ret <4 x half> %shuffle 394} 395 396define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 397; GFX9-LABEL: shuffle_v4f16_2356: 398; GFX9: ; %bb.0: 399; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 400; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 401; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 402; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 403; GFX9-NEXT: s_waitcnt vmcnt(1) 404; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 405; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 406; GFX9-NEXT: s_waitcnt vmcnt(0) 407; GFX9-NEXT: v_mov_b32_e32 v0, v4 408; GFX9-NEXT: s_setpc_b64 s[30:31] 409 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 410 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 411 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 412 ret <4 x half> %shuffle 413} 414 415define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 416; GFX9-LABEL: shuffle_v4f16_5623: 417; GFX9: ; %bb.0: 418; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 419; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 420; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 421; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 422; GFX9-NEXT: s_waitcnt vmcnt(1) 423; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 424; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 425; GFX9-NEXT: s_waitcnt vmcnt(0) 426; GFX9-NEXT: v_mov_b32_e32 v1, v4 427; GFX9-NEXT: s_setpc_b64 s[30:31] 428 %val0 = load <4 x half>, <4 x half> 
addrspace(1)* %arg0 429 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 430 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3> 431 ret <4 x half> %shuffle 432} 433 434define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 435; GFX9-LABEL: shuffle_v4f16_3456: 436; GFX9: ; %bb.0: 437; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 438; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 439; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 440; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 441; GFX9-NEXT: s_waitcnt vmcnt(1) 442; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 443; GFX9-NEXT: s_waitcnt vmcnt(0) 444; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 445; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 446; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 447; GFX9-NEXT: s_setpc_b64 s[30:31] 448 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 449 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 450 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 451 ret <4 x half> %shuffle 452} 453 454define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 455; GFX9-LABEL: shuffle_v4f16_5634: 456; GFX9: ; %bb.0: 457; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 458; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 459; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 460; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 461; GFX9-NEXT: s_waitcnt vmcnt(0) 462; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 463; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 464; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 465; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 466; 
GFX9-NEXT: s_setpc_b64 s[30:31] 467 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 468 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 469 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4> 470 ret <4 x half> %shuffle 471} 472 473define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 474; GFX9-LABEL: shuffle_v4f16_5734: 475; GFX9: ; %bb.0: 476; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 477; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 478; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 479; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 480; GFX9-NEXT: s_waitcnt vmcnt(1) 481; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 482; GFX9-NEXT: s_waitcnt vmcnt(0) 483; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 484; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 485; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 486; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 487; GFX9-NEXT: s_setpc_b64 s[30:31] 488 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 489 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 490 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4> 491 ret <4 x half> %shuffle 492} 493 494define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 495; GFX9-LABEL: shuffle_v4i16_2356: 496; GFX9: ; %bb.0: 497; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 498; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 499; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 500; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 501; GFX9-NEXT: s_waitcnt vmcnt(1) 502; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 503; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 504; GFX9-NEXT: s_waitcnt vmcnt(0) 505; 
GFX9-NEXT: v_mov_b32_e32 v0, v4 506; GFX9-NEXT: s_setpc_b64 s[30:31] 507 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 508 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 509 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 510 ret <4 x i16> %shuffle 511} 512 513define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) { 514; GFX9-LABEL: shuffle_v4i16_0167: 515; GFX9: ; %bb.0: 516; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 517; GFX9-NEXT: global_load_dword v4, v[0:1], off 518; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 519; GFX9-NEXT: s_waitcnt vmcnt(1) 520; GFX9-NEXT: v_mov_b32_e32 v0, v4 521; GFX9-NEXT: s_waitcnt vmcnt(0) 522; GFX9-NEXT: v_mov_b32_e32 v1, v5 523; GFX9-NEXT: s_setpc_b64 s[30:31] 524 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 525 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 526 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 527 ret <4 x i16> %shuffle 528} 529 530define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 531; GFX9-LABEL: shuffle_v4f16_0000: 532; GFX9: ; %bb.0: 533; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 534; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 535; GFX9-NEXT: s_waitcnt vmcnt(0) 536; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 537; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 538; GFX9-NEXT: v_mov_b32_e32 v1, v0 539; GFX9-NEXT: s_setpc_b64 s[30:31] 540 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 541 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 542 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer 543 ret <4 x half> %shuffle 544} 545 546define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 547; GFX9-LABEL: shuffle_v4f16_1010: 548; GFX9: ; %bb.0: 549; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 550; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 551; GFX9-NEXT: s_waitcnt vmcnt(0) 552; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 553; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 554; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 555; GFX9-NEXT: v_mov_b32_e32 v1, v0 556; GFX9-NEXT: s_setpc_b64 s[30:31] 557 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 558 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 559 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0> 560 ret <4 x half> %shuffle 561} 562 563define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 564; GFX9-LABEL: shuffle_v4f16_1100: 565; GFX9: ; %bb.0: 566; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 567; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 568; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 569; GFX9-NEXT: s_waitcnt vmcnt(0) 570; GFX9-NEXT: v_and_b32_e32 v1, v2, v0 571; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 572; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 573; GFX9-NEXT: v_and_b32_e32 v0, v2, v3 574; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 575; GFX9-NEXT: s_setpc_b64 s[30:31] 576 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 577 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 578 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0> 579 ret <4 x half> %shuffle 580} 581 582define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 583; GFX9-LABEL: shuffle_v4f16_6161: 584; GFX9: ; %bb.0: 585; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 586; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 587; GFX9-NEXT: global_load_dword v5, v[0:1], off 588; GFX9-NEXT: s_waitcnt vmcnt(1) 589; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 590; GFX9-NEXT: s_waitcnt vmcnt(0) 591; GFX9-NEXT: v_lshrrev_b32_e32 v0, 
16, v5 592; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 593; GFX9-NEXT: v_mov_b32_e32 v1, v0 594; GFX9-NEXT: s_setpc_b64 s[30:31] 595 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 596 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 597 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1> 598 ret <4 x half> %shuffle 599} 600 601define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 602; GFX9-LABEL: shuffle_v4f16_2333: 603; GFX9: ; %bb.0: 604; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 605; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 606; GFX9-NEXT: s_waitcnt vmcnt(0) 607; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 608; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 609; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 610; GFX9-NEXT: s_setpc_b64 s[30:31] 611 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 612 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 613 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 614 ret <4 x half> %shuffle 615} 616 ; NOTE(review): the function name below suggests shuffle mask <6,6,6,7>, but its shufflevector mask is <2,3,3,3>, identical to shuffle_v4f16_2333 above, so %val1 is never selected and this test duplicates the previous one. If <6,6,6,7> was intended, change the mask and regenerate the assertions with utils/update_llc_test_checks.py rather than editing them by hand. 617define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 618; GFX9-LABEL: shuffle_v4f16_6667: 619; GFX9: ; %bb.0: 620; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 621; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 622; GFX9-NEXT: s_waitcnt vmcnt(0) 623; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 624; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 625; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 626; GFX9-NEXT: s_setpc_b64 s[30:31] 627 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 628 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 629 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 630 ret <4 x half> %shuffle 631} 632 633define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 634; 
GFX9-LABEL: shuffle_v8f16_0101: 635; GFX9: ; %bb.0: 636; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 637; GFX9-NEXT: global_load_dword v0, v[0:1], off 638; GFX9-NEXT: s_waitcnt vmcnt(0) 639; GFX9-NEXT: v_mov_b32_e32 v1, v0 640; GFX9-NEXT: s_setpc_b64 s[30:31] 641 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 642 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 643 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 644 ret <4 x half> %shuffle 645} 646 647define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 648; GFX9-LABEL: shuffle_v8f16_0123: 649; GFX9: ; %bb.0: 650; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 651; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 652; GFX9-NEXT: s_waitcnt vmcnt(0) 653; GFX9-NEXT: s_setpc_b64 s[30:31] 654 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 655 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 656 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 657 ret <4 x half> %shuffle 658} 659 660define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) { 661; GFX9-LABEL: shuffle_v8f16_4589: 662; GFX9: ; %bb.0: 663; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 664; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 665; GFX9-NEXT: global_load_dword v5, v[2:3], off 666; GFX9-NEXT: s_waitcnt vmcnt(1) 667; GFX9-NEXT: v_mov_b32_e32 v0, v4 668; GFX9-NEXT: s_waitcnt vmcnt(0) 669; GFX9-NEXT: v_mov_b32_e32 v1, v5 670; GFX9-NEXT: s_setpc_b64 s[30:31] 671 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 672 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 673 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9> 674 ret <4 x half> %shuffle 675} 676 677define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> 
addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Narrowing shuffle of two <8 x half> loads into <4 x half>: mask indices 13
; and 14 select elements 5 and 6 of %val1, indices 2 and 3 select elements 2
; and 3 of %val0. Elements 5 and 6 sit in different dwords of the 8-byte load
; at offset 8, so the expected code rebuilds that pair with an SDWA and plus
; v_lshl_or_b32, while the %val0 pair is a plain dword load at offset 4.
define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Widening shuffle from <3 x half> to <4 x half>: elements 0, 1, 2 of %val0
; with element 2 repeated in the last lane. No mask index refers to %val1, and
; the expected code contains a single load (from %arg0).
define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v3f16_0122:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x half> %shuffle
}

; Widening shuffle from <2 x half> to <4 x half> that only reads %val0.
; NOTE(review): despite the _0122 suffix in the name, the mask below is
; <0, 1, 1, 0>. The assertions agree with the mask (the second result dword is
; the loaded dword with its two halves swapped), so the name and the mask
; disagree; confirm which one is intended before regenerating the checks.
define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v2f16_0122:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  ret <4 x half> %shuffle
}

; <6 x half> shuffle built only from whole element pairs: elements 4,5 and 2,3
; of %val0, then elements 0,1 of %val1 (mask indices 6, 7). The expected code
; therefore consists of loads and register moves only.
define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v6f16_452367:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, v1
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: v_mov_b32_e32 v3, v2
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
; GFX9-NEXT: global_load_dword v7, v[3:4], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
  ret <6 x half> %shuffle
}

; Kernel that loads one <4 x half> from each of %A, %B and %C at the index
; given by the workitem id, then updates the %C element through a chain of
; four <2 x half> fma calls:
;   low  pair = fma(splat A[1], B[2..3], fma(splat A[0], B[0..1], C[0..1]))
;   high pair = fma(splat A[3], B[2..3], fma(splat A[2], B[0..1], C[2..3]))
; All splats, sub-vector extracts and re-inserts are written as shufflevector.
; The expected code has no standalone shuffle instructions: the four
; v_pk_fma_f16 carry op_sel / op_sel_hi modifiers instead.
define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {
; GFX9-LABEL: fma_shuffle:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
entry:
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp12 = zext i32 %tmp1 to i64
  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
  ret void
}

; <4 x half> shuffle taking element 0 of %val0 followed by elements 0, 1 and 2
; of %val1 (mask indices 4, 5, 6). Every result dword combines halves that
; come from different source dwords, so the expected code masks with 0xffff
; and merges with v_lshl_or_b32 for both result registers.
define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0456:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT: v_and_b32_e32 v1, v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  ret <4 x half> %shuffle
}

; Kernel that loads an <8 x i32> through an addrspace(4) pointer, keeps only
; elements 0..3 via an identity-prefix shuffle and stores them to %out. The
; expected code fetches the data with a single s_load_dwordx4, i.e. only the
; four dwords that are actually used.
define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) {
; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; Intrinsics used by fma_shuffle; both share attribute group #0.
declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }