; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s

; idot8_acc32: signed 8 x i4 dot product with a 32-bit accumulator.
; Each work-item loads one packed <8 x i4> value from %src1 and from %src2
; (indexed by the work-item id), sign-extends every lane to i32, multiplies the
; lanes pairwise, adds the eight products to the i32 loaded from %dst and
; stores the sum back to %dst.
; The gfx906 and gfx10 runs expect the whole reduction to be selected as a
; single v_dot8_i32_i4; the gfx700, gfx803 and gfx900 runs expect the expanded
; v_bfe_i32 plus multiply/add sequence.
define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v2, v3, s0, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v11, v12
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v13, v14
; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v15, v16
; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v8
; GFX9-NEXT:    v_add3_u32 v1, v2, v9, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc32:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item addressing: the work-item id selects which <8 x i4> element
  ; of each source array this lane reads.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Lanes 0-7: extract the i4 from each vector, sign-extend to i32, multiply.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Accumulate: start from the value already in %dst and add the products in
  ; lane order, forming a linear add chain.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Once the unnecessary zero extensions of the elements are removed, the
; pattern recognizer will kick in.
325define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, 326; GFX7-LABEL: idot8_acc16: 327; GFX7: ; %bb.0: ; %entry 328; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 329; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 330; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 331; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 332; GFX7-NEXT: s_mov_b32 s14, -1 333; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 334; GFX7-NEXT: s_add_u32 s12, s12, s3 335; GFX7-NEXT: s_mov_b32 s3, 0xf000 336; GFX7-NEXT: s_mov_b32 s10, 0 337; GFX7-NEXT: s_mov_b32 s11, s3 338; GFX7-NEXT: s_waitcnt lgkmcnt(0) 339; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 340; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 341; GFX7-NEXT: v_mov_b32_e32 v1, 0 342; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 343; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 344; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 345; GFX7-NEXT: s_mov_b32 s2, -1 346; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 347; GFX7-NEXT: s_mov_b32 s4, 0xffff 348; GFX7-NEXT: s_addc_u32 s13, s13, 0 349; GFX7-NEXT: s_waitcnt vmcnt(2) 350; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 351; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 352; GFX7-NEXT: s_waitcnt vmcnt(1) 353; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 354; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 355; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 356; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 357; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 358; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 359; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 360; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 361; GFX7-NEXT: s_waitcnt vmcnt(0) 362; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 363; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 364; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 365; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 366; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 367; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 368; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 369; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 370; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 371; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 372; GFX7-NEXT: 
v_mad_u32_u24 v1, v5, v12, v1 373; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 374; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 375; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 376; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 377; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 378; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 379; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 380; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 381; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 382; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 383; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 384; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 385; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 386; GFX7-NEXT: v_and_b32_e32 v16, s4, v16 387; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 388; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 389; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 390; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 391; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 392; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 393; GFX7-NEXT: s_endpgm 394; 395; GFX8-LABEL: idot8_acc16: 396; GFX8: ; %bb.0: ; %entry 397; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 398; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 399; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 400; GFX8-NEXT: v_mov_b32_e32 v5, 12 401; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 402; GFX8-NEXT: s_waitcnt lgkmcnt(0) 403; GFX8-NEXT: v_mov_b32_e32 v1, s5 404; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 405; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 406; GFX8-NEXT: flat_load_dword v3, v[0:1] 407; GFX8-NEXT: v_mov_b32_e32 v1, s7 408; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 409; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 410; GFX8-NEXT: flat_load_dword v2, v[0:1] 411; GFX8-NEXT: v_mov_b32_e32 v0, s0 412; GFX8-NEXT: v_mov_b32_e32 v1, s1 413; GFX8-NEXT: flat_load_ushort v4, v[0:1] 414; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 415; GFX8-NEXT: s_mov_b32 s10, -1 416; GFX8-NEXT: s_mov_b32 s11, 0xe80000 417; GFX8-NEXT: s_add_u32 s8, s8, s3 418; GFX8-NEXT: s_addc_u32 s9, s9, 0 419; GFX8-NEXT: s_waitcnt vmcnt(2) 420; GFX8-NEXT: v_lshrrev_b32_e32 
v10, 4, v3 421; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 422; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 423; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 424; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 425; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 426; GFX8-NEXT: s_waitcnt vmcnt(1) 427; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 428; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 429; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 430; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 431; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 432; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 433; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 434; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 435; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 436; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 437; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 438; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 439; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 440; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 441; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 442; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 443; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 444; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 445; GFX8-NEXT: s_waitcnt vmcnt(0) 446; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4 447; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 448; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 449; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 450; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 451; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4 452; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 453; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 454; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 455; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 456; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 457; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 458; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 
459; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 460; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 461; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 462; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 463; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 464; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 465; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 466; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 467; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 468; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 469; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 470; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 471; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 472; GFX8-NEXT: flat_store_short v[0:1], v2 473; GFX8-NEXT: s_endpgm 474; 475; GFX9-LABEL: idot8_acc16: 476; GFX9: ; %bb.0: ; %entry 477; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 478; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 479; GFX9-NEXT: s_mov_b32 s10, -1 480; GFX9-NEXT: s_mov_b32 s11, 0xe00000 481; GFX9-NEXT: s_add_u32 s8, s8, s3 482; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 483; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 484; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 485; GFX9-NEXT: v_mov_b32_e32 v4, 12 486; GFX9-NEXT: s_waitcnt lgkmcnt(0) 487; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 488; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 489; GFX9-NEXT: v_mov_b32_e32 v0, 0 490; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] 491; GFX9-NEXT: s_addc_u32 s9, s9, 0 492; GFX9-NEXT: s_waitcnt vmcnt(2) 493; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 494; GFX9-NEXT: s_waitcnt vmcnt(1) 495; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 496; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 497; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2 498; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 499; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 500; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 501; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 502; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 503; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 504; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 505; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 506; GFX9-NEXT: 
v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 507; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 508; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 509; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 510; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 511; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 512; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 513; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 514; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 515; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 516; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 517; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 518; GFX9-NEXT: s_waitcnt vmcnt(0) 519; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 520; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 521; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 522; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 523; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 524; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 525; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 526; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 527; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 528; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 529; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 530; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 531; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 532; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 533; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 534; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 535; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 536; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 537; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 538; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 539; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 540; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 541; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 542; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 543; GFX9-NEXT: v_mad_legacy_u16 
v1, v1, v2, v3 544; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 545; GFX9-NEXT: global_store_short v0, v1, s[2:3] 546; GFX9-NEXT: s_endpgm 547; 548; GFX9-DL-LABEL: idot8_acc16: 549; GFX9-DL: ; %bb.0: ; %entry 550; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 551; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 552; GFX9-DL-NEXT: s_mov_b32 s10, -1 553; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 554; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 555; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 556; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 557; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 558; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 559; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 560; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 561; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 562; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 563; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] 564; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 565; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 566; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 567; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 568; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 569; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 570; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 571; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 572; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 573; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 574; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 575; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 576; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 577; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 578; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 579; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 580; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 581; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 582; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_3 583; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 584; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 585; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 586; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 587; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 588; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 589; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 590; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 591; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 592; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 593; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 594; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 595; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 596; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 597; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 598; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 599; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 600; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 601; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 602; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 603; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 604; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 605; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 606; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 607; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 608; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 609; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 610; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 611; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 612; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 613; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 614; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 615; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 616; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 617; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 618; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] 619; GFX9-DL-NEXT: s_endpgm 620; 621; GFX10-DL-XNACK-LABEL: idot8_acc16: 622; GFX10-DL-XNACK: ; %bb.0: ; %entry 623; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 624; 
GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 625; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 626; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 627; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 628; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 629; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 630; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 631; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 632; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 633; GFX10-DL-XNACK-NEXT: s_clause 0x1 634; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] 635; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] 636; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 637; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] 638; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 639; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 640; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 641; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 642; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 643; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 644; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 645; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 646; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 647; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 648; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 649; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 650; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 651; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 652; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 653; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 654; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 655; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 656; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 657; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 658; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 659; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 660; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 661; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 
v2, 12, v2 662; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 663; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 664; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 665; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 666; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 667; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 668; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 669; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 670; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 671; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 672; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 673; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 674; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 675; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 676; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 677; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 678; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 679; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 680; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 681; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 682; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 683; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 684; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 685; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 686; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 687; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 688; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 689; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 690; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 691; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 692; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 693; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 694; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 695; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] 696; GFX10-DL-XNACK-NEXT: s_endpgm 697; 698; GFX10-DL-NOXNACK-LABEL: idot8_acc16: 699; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 700; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 701; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 702; 
GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 703; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 704; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 705; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 706; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 707; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 708; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 709; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 710; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 711; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 712; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 713; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] 714; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] 715; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] 716; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 717; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 718; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 719; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 720; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 721; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 722; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 723; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 724; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 725; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 726; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 727; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 728; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 729; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 730; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 731; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 732; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 733; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 734; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 735; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 736; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 737; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 738; 
GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 739; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 740; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 741; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 742; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 743; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 744; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 745; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 746; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 747; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 748; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 749; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 750; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 751; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 752; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 753; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 754; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 755; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 756; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 757; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 758; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 759; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 760; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 761; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 762; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 763; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 764; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 765; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 766; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 767; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 768; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 769; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 770; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 771; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 772; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 773; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] 774; GFX10-DL-NOXNACK-NEXT: s_endpgm 775; GFX10-DL-LABEL: idot8_acc16: 776; 
GFX10-DL: ; %bb.0: ; %entry 777; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 778; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 779; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 780; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 781; GFX10-DL-NEXT: s_mov_b32 s14, -1 782; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 783; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 784; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 785; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 786; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 787; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 788; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 789; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 790; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 791; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 792; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 793; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 794; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 795; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2 796; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3 797; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 798; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 799; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 800; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 801; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10 802; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 803; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 804; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 805; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 806; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 807; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1 808; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff 809; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 810; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 811; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 812; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 813; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1 814; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 815; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 816; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 817; GFX10-DL-NEXT: 
v_mad_i32_i24 v1, s2, s3, v1 818; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 819; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 820; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 821; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 822; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 823; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 824; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 825; GFX10-DL-NEXT: s_endpgm 826 <8 x i4> addrspace(1)* %src2, 827 i16 addrspace(1)* nocapture %dst) { 828entry: 829 %idx = call i32 @llvm.amdgcn.workitem.id.x() 830 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx 831 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1 832 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx 833 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2 834 835 %v1e0 = extractelement <8 x i4> %vec1, i64 0 836 %cv1e0 = sext i4 %v1e0 to i16 837 %v2e0 = extractelement <8 x i4> %vec2, i64 0 838 %cv2e0 = sext i4 %v2e0 to i16 839 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0 840 841 %v1e1 = extractelement <8 x i4> %vec1, i64 1 842 %cv1e1 = sext i4 %v1e1 to i16 843 %v2e1 = extractelement <8 x i4> %vec2, i64 1 844 %cv2e1 = sext i4 %v2e1 to i16 845 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1 846 847 %v1e2 = extractelement <8 x i4> %vec1, i64 2 848 %cv1e2 = sext i4 %v1e2 to i16 849 %v2e2 = extractelement <8 x i4> %vec2, i64 2 850 %cv2e2 = sext i4 %v2e2 to i16 851 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2 852 853 %v1e3 = extractelement <8 x i4> %vec1, i64 3 854 %cv1e3 = sext i4 %v1e3 to i16 855 %v2e3 = extractelement <8 x i4> %vec2, i64 3 856 %cv2e3 = sext i4 %v2e3 to i16 857 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3 858 859 %v1e4 = extractelement <8 x i4> %vec1, i64 4 860 %cv1e4 = sext i4 %v1e4 to i16 861 %v2e4 = extractelement <8 x i4> %vec2, i64 4 862 %cv2e4 = sext i4 %v2e4 to i16 863 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4 864 865 %v1e5 = extractelement <8 x i4> %vec1, i64 5 866 %cv1e5 = sext i4 %v1e5 to i16 867 %v2e5 = extractelement <8 x i4> %vec2, i64 5 868 %cv2e5 = sext 
i4 %v2e5 to i16 869 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5 870 871 %v1e6 = extractelement <8 x i4> %vec1, i64 6 872 %cv1e6 = sext i4 %v1e6 to i16 873 %v2e6 = extractelement <8 x i4> %vec2, i64 6 874 %cv2e6 = sext i4 %v2e6 to i16 875 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6 876 877 %v1e7 = extractelement <8 x i4> %vec1, i64 7 878 %cv1e7 = sext i4 %v1e7 to i16 879 %v2e7 = extractelement <8 x i4> %vec2, i64 7 880 %cv2e7 = sext i4 %v2e7 to i16 881 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7 882 883 %acc = load i16, i16 addrspace(1)* %dst, align 4 884 %add1 = add i16 %mul0, %acc 885 %add2 = add i16 %add1, %mul1 886 %add3 = add i16 %add2, %mul2 887 %add4 = add i16 %add3, %mul3 888 %add5 = add i16 %add4, %mul4 889 %add6 = add i16 %add5, %mul5 890 %add7 = add i16 %add6, %mul6 891 %add8 = add i16 %add7, %mul7 892 893 store i16 %add8, i16 addrspace(1)* %dst, align 4 894 ret void 895} 896 897; TODO: Support this pattern. 898define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, 899; GFX7-LABEL: idot8_acc8: 900; GFX7: ; %bb.0: ; %entry 901; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 902; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 903; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 904; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 905; GFX7-NEXT: s_mov_b32 s14, -1 906; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 907; GFX7-NEXT: s_add_u32 s12, s12, s3 908; GFX7-NEXT: s_mov_b32 s3, 0xf000 909; GFX7-NEXT: s_mov_b32 s10, 0 910; GFX7-NEXT: s_mov_b32 s11, s3 911; GFX7-NEXT: s_waitcnt lgkmcnt(0) 912; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 913; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 914; GFX7-NEXT: v_mov_b32_e32 v1, 0 915; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 916; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 917; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 918; GFX7-NEXT: s_mov_b32 s2, -1 919; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 920; GFX7-NEXT: s_movk_i32 s4, 0xff 921; GFX7-NEXT: s_addc_u32 s13, s13, 0 922; GFX7-NEXT: s_waitcnt vmcnt(2) 923; 
GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 924; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 925; GFX7-NEXT: s_waitcnt vmcnt(1) 926; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 927; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 928; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 929; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 930; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 931; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 932; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 933; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 934; GFX7-NEXT: s_waitcnt vmcnt(0) 935; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 936; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 937; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 938; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 939; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 940; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 941; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 942; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 943; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 944; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 945; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 946; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 947; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 948; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 949; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 950; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 951; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 952; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 953; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 954; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 955; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 956; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 957; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 958; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 959; GFX7-NEXT: v_and_b32_e32 v16, s4, v16 960; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 961; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 962; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 963; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 964; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 965; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 966; GFX7-NEXT: s_endpgm 967; 968; GFX8-LABEL: idot8_acc8: 969; GFX8: ; %bb.0: ; %entry 970; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 971; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 
972; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 973; GFX8-NEXT: v_mov_b32_e32 v5, 12 974; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 975; GFX8-NEXT: s_waitcnt lgkmcnt(0) 976; GFX8-NEXT: v_mov_b32_e32 v1, s5 977; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 978; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 979; GFX8-NEXT: flat_load_dword v3, v[0:1] 980; GFX8-NEXT: v_mov_b32_e32 v1, s7 981; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 982; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 983; GFX8-NEXT: flat_load_dword v2, v[0:1] 984; GFX8-NEXT: v_mov_b32_e32 v0, s0 985; GFX8-NEXT: v_mov_b32_e32 v1, s1 986; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 987; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 988; GFX8-NEXT: s_mov_b32 s10, -1 989; GFX8-NEXT: s_mov_b32 s11, 0xe80000 990; GFX8-NEXT: s_add_u32 s8, s8, s3 991; GFX8-NEXT: s_addc_u32 s9, s9, 0 992; GFX8-NEXT: s_waitcnt vmcnt(2) 993; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 994; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 995; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 996; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 997; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 998; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 999; GFX8-NEXT: s_waitcnt vmcnt(1) 1000; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 1001; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 1002; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1003; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 1004; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 1005; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 1006; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1007; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1008; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1009; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1010; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 1011; GFX8-NEXT: v_ashrrev_i16_e32 
v16, 12, v17 1012; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1013; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 1014; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1015; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1016; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1017; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 1018; GFX8-NEXT: s_waitcnt vmcnt(0) 1019; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4 1020; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1021; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1022; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1023; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1024; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4 1025; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1026; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1027; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 1028; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1029; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 1030; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1031; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1032; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 1033; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1034; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1035; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 1036; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 1037; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1038; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1039; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1040; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 1041; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1042; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1043; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1044; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 1045; GFX8-NEXT: flat_store_byte v[0:1], v2 1046; GFX8-NEXT: s_endpgm 1047; 1048; GFX9-LABEL: idot8_acc8: 1049; GFX9: ; %bb.0: ; %entry 1050; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1051; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1052; GFX9-NEXT: s_mov_b32 s10, -1 1053; GFX9-NEXT: s_mov_b32 s11, 0xe00000 1054; GFX9-NEXT: s_add_u32 s8, s8, s3 1055; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1056; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1057; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1058; GFX9-NEXT: v_mov_b32_e32 v4, 12 1059; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1060; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 1061; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 1062; GFX9-NEXT: v_mov_b32_e32 v0, 0 1063; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] 1064; GFX9-NEXT: s_addc_u32 s9, s9, 0 1065; GFX9-NEXT: s_waitcnt vmcnt(2) 1066; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 1067; GFX9-NEXT: s_waitcnt vmcnt(1) 1068; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 1069; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 1070; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2 1071; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 1072; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1073; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 1074; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 1075; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 1076; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 1077; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 1078; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 1079; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1080; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1081; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1082; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1083; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 1084; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 1085; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1086; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1087; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1088; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1089; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1090; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1091; GFX9-NEXT: s_waitcnt vmcnt(0) 1092; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 1093; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1094; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1095; GFX9-NEXT: 
v_ashrrev_i16_e32 v8, 12, v8 1096; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1097; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 1098; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1099; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1100; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 1101; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1102; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1103; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1104; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1105; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 1106; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1107; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1108; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 1109; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 1110; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1111; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 1112; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1113; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 1114; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 1115; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1116; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1117; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 1118; GFX9-NEXT: global_store_byte v0, v1, s[2:3] 1119; GFX9-NEXT: s_endpgm 1120; 1121; GFX9-DL-LABEL: idot8_acc8: 1122; GFX9-DL: ; %bb.0: ; %entry 1123; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1124; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1125; GFX9-DL-NEXT: s_mov_b32 s10, -1 1126; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 1127; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 1128; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1129; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1130; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1131; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 1132; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1133; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1134; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1135; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1136; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] 1137; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 1138; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1139; 
GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 1140; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1141; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 1142; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 1143; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 1144; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 1145; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1146; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 1147; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 1148; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 1149; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 1150; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 1151; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 1152; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1153; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1154; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1155; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1156; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 1157; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 1158; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1159; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1160; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1161; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1162; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1163; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1164; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1165; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 1166; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1167; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1168; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1169; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1170; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 1171; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1172; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1173; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 1174; 
GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1175; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1176; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1177; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1178; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 1179; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1180; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1181; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 1182; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 1183; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1184; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 1185; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1186; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 1187; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 1188; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1189; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1190; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 1191; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] 1192; GFX9-DL-NEXT: s_endpgm 1193; 1194; GFX10-DL-XNACK-LABEL: idot8_acc8: 1195; GFX10-DL-XNACK: ; %bb.0: ; %entry 1196; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1197; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1198; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1199; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1200; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1201; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 1202; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 1203; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 1204; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 1205; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX10-DL-XNACK-NEXT: s_clause 0x1 1207; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] 1208; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] 1209; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 1210; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1] 1211; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 1212; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1213; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 
1214; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1215; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1216; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 1217; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 1218; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 1219; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 1220; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 1221; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 1222; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 1223; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1224; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1225; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 1226; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 1227; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 1228; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 1229; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 1230; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 1231; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 1232; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 1233; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 1234; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 1235; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 1236; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 1237; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 1238; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 1239; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 1240; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 1241; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 1242; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 1243; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 1244; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 1245; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 1246; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 1247; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 1248; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 1249; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 1250; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 1251; 
GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 1252; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 1253; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 1254; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 1255; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 1256; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 1257; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 1258; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 1259; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 1260; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 1261; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 1262; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 1263; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 1264; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 1265; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 1266; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 1267; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 1268; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1] 1269; GFX10-DL-XNACK-NEXT: s_endpgm 1270; 1271; GFX10-DL-NOXNACK-LABEL: idot8_acc8: 1272; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 1273; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1274; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1275; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1276; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1277; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 1278; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1279; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1280; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 1281; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 1282; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 1283; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 1284; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 1285; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1286; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] 1287; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] 1288; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1] 1289; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 
1290; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1291; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1292; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1293; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1294; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 1295; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 1296; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 1297; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 1298; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 1299; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 1300; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 1301; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 1302; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 1303; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 1304; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 1305; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 1306; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 1307; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 1308; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 1309; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 1310; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 1311; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 1312; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 1313; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 1314; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 1315; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 1316; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 1317; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 1318; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 1319; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 1320; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 1321; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 1322; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 1323; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 1324; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 1325; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 
v9, 12, v9 1326; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 1327; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 1328; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 1329; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 1330; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 1331; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 1332; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 1333; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 1334; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 1335; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 1336; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 1337; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 1338; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 1339; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 1340; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 1341; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 1342; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 1343; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 1344; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 1345; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 1346; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1] 1347; GFX10-DL-NOXNACK-NEXT: s_endpgm 1348; GFX10-DL-LABEL: idot8_acc8: 1349; GFX10-DL: ; %bb.0: ; %entry 1350; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1351; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1352; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1353; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1354; GFX10-DL-NEXT: s_mov_b32 s14, -1 1355; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 1356; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 1357; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1358; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 1359; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1360; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 1361; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1362; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1363; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 1365; 
GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 1366; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 1367; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 1368; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2 1369; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3 1370; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 1371; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 1372; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 1373; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 1374; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10 1375; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 1376; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 1377; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 1378; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1379; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 1380; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1 1381; GFX10-DL-NEXT: s_movk_i32 s2, 0xff 1382; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 1383; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 1384; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1385; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 1386; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1 1387; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 1388; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 1389; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 1390; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 1391; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 1392; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 1393; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 1394; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 1395; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 1396; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 1397; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 1398; GFX10-DL-NEXT: s_endpgm 1399 <8 x i4> addrspace(1)* %src2, 1400 i8 addrspace(1)* nocapture %dst) { 1401entry: 1402 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1403 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx 1404 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1 1405 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, 
i32 %idx 1406 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2 1407 1408 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1409 %cv1e0 = sext i4 %v1e0 to i8 1410 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1411 %cv2e0 = sext i4 %v2e0 to i8 1412 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 1413 1414 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1415 %cv1e1 = sext i4 %v1e1 to i8 1416 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1417 %cv2e1 = sext i4 %v2e1 to i8 1418 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 1419 1420 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1421 %cv1e2 = sext i4 %v1e2 to i8 1422 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1423 %cv2e2 = sext i4 %v2e2 to i8 1424 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 1425 1426 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1427 %cv1e3 = sext i4 %v1e3 to i8 1428 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1429 %cv2e3 = sext i4 %v2e3 to i8 1430 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 1431 1432 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1433 %cv1e4 = sext i4 %v1e4 to i8 1434 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1435 %cv2e4 = sext i4 %v2e4 to i8 1436 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 1437 1438 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1439 %cv1e5 = sext i4 %v1e5 to i8 1440 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1441 %cv2e5 = sext i4 %v2e5 to i8 1442 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 1443 1444 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1445 %cv1e6 = sext i4 %v1e6 to i8 1446 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1447 %cv2e6 = sext i4 %v2e6 to i8 1448 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 1449 1450 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1451 %cv1e7 = sext i4 %v1e7 to i8 1452 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1453 %cv2e7 = sext i4 %v2e7 to i8 1454 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 1455 1456 %acc = load i8, i8 addrspace(1)* %dst, align 4 1457 %add1 = add i8 %mul0, %acc 1458 %add2 = add i8 %add1, %mul1 1459 %add3 = add i8 %add2, %mul2 1460 %add4 = add i8 %add3, %mul3 1461 %add5 = add 
i8 %add4, %mul4 1462 %add6 = add i8 %add5, %mul5 1463 %add7 = add i8 %add6, %mul6 1464 %add8 = add i8 %add7, %mul7 1465 1466 store i8 %add8, i8 addrspace(1)* %dst, align 4 1467 ret void 1468} 1469
; Make sure the pattern is not recognized if there are multiple uses of the
; intermediate multiplications.
;
; In the body below %mul0 is an operand of both %add and %add1, and the partial
; sum %add (= %mul0 + %acc) feeds the accumulation chain through %add1 and is
; added a second time to form %res. The checks for every run line keep the
; explicit v_bfe_i32 plus v_mad_i32_i24 / v_mul_i32_i24 / v_add3_u32 expansion
; and contain no v_dot8_i32_i4.
define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_multiuses_mul1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v16, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, v16
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v16, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v16, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_multiuses_mul1:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_multiuses_mul1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-DL-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-DL-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-DL-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v4, v2, 4, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v2, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v2, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v5, v0, v7, s2
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v12, v2, 20, 4
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v0, v0, v7, v5
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v13, v2, 24, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v8, v6
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v1, v5
; GFX10-DL-XNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v4, v0, 4, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v0, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v0, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v5, v2, v7, s2
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v12, v0, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v13, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v8, v6
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                                <8 x i4> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item reads one <8 x i4> from each source, indexed by its x id.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Per-lane: extract, sign-extend i4 -> i32, multiply.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  %acc = load i32, i32 addrspace(1)* %dst, align 4
  ; %mul0 is consumed twice (here and in %add1); %add is consumed twice
  ; (by %add1 and by %res).
  %add = add i32 %mul0, %acc
  %add1 = add i32 %mul0, %add
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  ; Second use of the partial sum %add.
  %res = add i32 %add, %add8
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}
1905 1906; TODO: Support this pattern.
; Vector form of the i32-accumulator reduction: both <8 x i4> inputs are
; sign-extended to <8 x i32> and multiplied as whole vectors, then the eight
; product lanes are extracted and summed together with the i32 accumulator
; loaded from %dst. The checks for the run lines whose prefixes carry -DL
; (gfx906, gfx1011, gfx1012, gfx1030, gfx1031) show this selected as a single
; v_dot8_i32_i4; the gfx700, gfx803 and gfx900 checks keep the v_bfe_i32 plus
; v_mad_i32_i24 / v_mul_i32_i24 expansion.
define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v2
; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v8, v2, 4, 4
; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 4, 4
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v10, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v9, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 28, v3
; GFX8-NEXT:    v_bfe_i32 v2, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 20, 4
; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v7, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
; GFX8-NEXT:    v_bfe_i32 v10, v0, 24, 4
; GFX8-NEXT:    v_bfe_i32 v11, v0, 20, 4
; GFX8-NEXT:    v_bfe_i32 v12, v0, 16, 4
; GFX8-NEXT:    v_bfe_i32 v13, v0, 12, 4
; GFX8-NEXT:    v_bfe_i32 v14, v0, 8, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 4, 4
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
; GFX8-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v2, v10, v0
; GFX8-NEXT:    v_mad_i32_i24 v2, v1, v9, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc32_vecMul:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 28, v1
; GFX9-NEXT:    v_bfe_i32 v4, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v8, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v15, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 4
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v9, v16
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v8, v15
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v7, v14
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v6, v13
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v12
; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v4, v11
; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v10
; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item reads one <8 x i4> from each source, indexed by its x id.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Widen and multiply as whole vectors; lanes are split out only afterwards.
  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>

  %mul = mul <8 x i32> %cvec1, %cvec2
  %mul0 = extractelement <8 x i32> %mul, i64 0
  %mul1 = extractelement <8 x i32> %mul, i64 1
  %mul2 = extractelement <8 x i32> %mul, i64 2
  %mul3 = extractelement <8 x i32> %mul, i64 3
  %mul4 = extractelement <8 x i32> %mul, i64 4
  %mul5 = extractelement <8 x i32> %mul, i64 5
  %mul6 = extractelement <8 x i32> %mul, i64 6
  %mul7 = extractelement <8 x i32> %mul, i64 7

  ; Linear accumulation chain seeded with the value currently stored at %dst;
  ; every product and every partial sum has exactly one use.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}
2183 2184; TODO: Support this pattern. 2185define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, 2186; GFX7-LABEL: idot8_acc16_vecMul: 2187; GFX7: ; %bb.0: ; %entry 2188; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2189; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2190; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2191; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2192; GFX7-NEXT: s_mov_b32 s14, -1 2193; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 2194; GFX7-NEXT: s_add_u32 s12, s12, s3 2195; GFX7-NEXT: s_mov_b32 s3, 0xf000 2196; GFX7-NEXT: s_mov_b32 s10, 0 2197; GFX7-NEXT: s_mov_b32 s11, s3 2198; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2199; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 2200; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2201; GFX7-NEXT: v_mov_b32_e32 v1, 0 2202; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2203; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 2204; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2205; GFX7-NEXT: s_mov_b32 s2, -1 2206; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 2207; GFX7-NEXT: s_mov_b32 s4, 0xffff 2208; GFX7-NEXT: s_addc_u32 s13, s13, 0 2209; GFX7-NEXT: s_waitcnt vmcnt(2) 2210; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4 2211; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4 2212; GFX7-NEXT: v_bfe_i32 v5, v2, 4, 4 2213; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 2214; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2215; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 2216; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2217; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 2218; GFX7-NEXT: s_waitcnt vmcnt(1) 2219; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4 2220; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 2221; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4 2222; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 2223; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 2224; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 2225; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10 2226; GFX7-NEXT: v_and_b32_e32 v6, s4, v11 2227; 
GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 2228; GFX7-NEXT: v_and_b32_e32 v11, s4, v13 2229; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 2230; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 2231; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 2232; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 2233; GFX7-NEXT: v_and_b32_e32 v12, s4, v14 2234; GFX7-NEXT: v_and_b32_e32 v14, s4, v16 2235; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 2236; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 2237; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 2238; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 2239; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 2240; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 2241; GFX7-NEXT: s_waitcnt vmcnt(0) 2242; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 2243; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4 2244; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 2245; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 2246; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 2247; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 2248; GFX7-NEXT: v_and_b32_e32 v13, s4, v15 2249; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1 2250; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 2251; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 2252; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 2253; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 2254; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 2255; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 2256; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 2257; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2258; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0 2259; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 2260; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 2261; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 2262; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 2263; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 2264; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 2265; GFX7-NEXT: s_endpgm 2266; 2267; GFX8-LABEL: idot8_acc16_vecMul: 2268; GFX8: ; %bb.0: ; %entry 2269; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2270; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2271; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2272; GFX8-NEXT: v_mov_b32_e32 v5, 12 2273; GFX8-NEXT: s_mov_b32 s8, 
SCRATCH_RSRC_DWORD0 2274; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2275; GFX8-NEXT: v_mov_b32_e32 v1, s5 2276; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2277; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2278; GFX8-NEXT: flat_load_dword v3, v[0:1] 2279; GFX8-NEXT: v_mov_b32_e32 v1, s7 2280; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 2281; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2282; GFX8-NEXT: flat_load_dword v2, v[0:1] 2283; GFX8-NEXT: v_mov_b32_e32 v0, s0 2284; GFX8-NEXT: v_mov_b32_e32 v1, s1 2285; GFX8-NEXT: flat_load_ushort v4, v[0:1] 2286; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2287; GFX8-NEXT: s_mov_b32 s10, -1 2288; GFX8-NEXT: s_mov_b32 s11, 0xe80000 2289; GFX8-NEXT: s_add_u32 s8, s8, s3 2290; GFX8-NEXT: s_addc_u32 s9, s9, 0 2291; GFX8-NEXT: s_waitcnt vmcnt(2) 2292; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3 2293; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 2294; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 2295; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3 2296; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3 2297; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2298; GFX8-NEXT: s_waitcnt vmcnt(1) 2299; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2 2300; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 2301; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 2302; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2 2303; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 2304; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2305; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 2306; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2307; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2308; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 2309; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 2310; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 2311; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 2312; GFX8-NEXT: 
v_ashrrev_i16_e32 v2, 12, v2 2313; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 2314; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2315; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2316; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2317; GFX8-NEXT: s_waitcnt vmcnt(0) 2318; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2319; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2320; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2321; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2322; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2323; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 2324; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2325; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2326; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2 2327; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 2328; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2329; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 2330; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2331; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2 2332; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2333; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2334; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2 2335; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 2336; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2337; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 2338; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 2339; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2 2340; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2341; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2342; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 2343; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2 2344; GFX8-NEXT: flat_store_short v[0:1], v2 2345; GFX8-NEXT: s_endpgm 2346; 2347; GFX9-LABEL: idot8_acc16_vecMul: 2348; GFX9: ; %bb.0: ; %entry 2349; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2350; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2351; GFX9-NEXT: s_mov_b32 s10, -1 2352; GFX9-NEXT: s_mov_b32 s11, 0xe00000 2353; GFX9-NEXT: s_add_u32 s8, s8, s3 2354; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2355; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2356; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2357; GFX9-NEXT: v_mov_b32_e32 v4, 
0xffff 2358; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2359; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 2360; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 2361; GFX9-NEXT: v_mov_b32_e32 v0, 0 2362; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] 2363; GFX9-NEXT: s_addc_u32 s9, s9, 0 2364; GFX9-NEXT: s_waitcnt vmcnt(2) 2365; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 2366; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4 2367; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 4 2368; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 2369; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4 2370; GFX9-NEXT: v_bfe_u32 v10, v1, 8, 4 2371; GFX9-NEXT: v_bfe_u32 v11, v1, 4, 4 2372; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 2373; GFX9-NEXT: s_waitcnt vmcnt(1) 2374; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2 2375; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 2376; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4 2377; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 2378; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4 2379; GFX9-NEXT: v_bfe_u32 v17, v2, 8, 4 2380; GFX9-NEXT: v_bfe_u32 v18, v2, 4, 4 2381; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 2382; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 2383; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 2384; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 2385; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 2386; GFX9-NEXT: v_and_b32_e32 v10, v4, v10 2387; GFX9-NEXT: v_and_b32_e32 v6, v4, v6 2388; GFX9-NEXT: v_and_b32_e32 v17, v4, v17 2389; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] 2390; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] 2391; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10 2392; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v6 2393; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17 2394; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 2395; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 2396; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 2397; GFX9-NEXT: v_and_b32_e32 v15, v4, v15 2398; GFX9-NEXT: v_and_b32_e32 v4, v4, v13 2399; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] 2400; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] 2401; GFX9-NEXT: v_pk_mul_lo_u16 v1, 
v1, v2 2402; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 2403; GFX9-NEXT: v_lshl_or_b32 v8, v14, 16, v15 2404; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4 2405; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] 2406; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] 2407; GFX9-NEXT: s_waitcnt vmcnt(0) 2408; GFX9-NEXT: v_add_u16_e32 v2, v1, v3 2409; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] 2410; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] 2411; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] 2412; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] 2413; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v6 2414; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2415; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] 2416; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] 2417; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] 2418; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] 2419; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 2420; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v4 2421; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v8 2422; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2423; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 2424; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2425; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 2426; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2427; GFX9-NEXT: global_store_short v0, v1, s[2:3] 2428; GFX9-NEXT: s_endpgm 2429; 2430; GFX9-DL-LABEL: idot8_acc16_vecMul: 2431; GFX9-DL: ; %bb.0: ; %entry 2432; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2433; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2434; GFX9-DL-NEXT: s_mov_b32 s10, -1 2435; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 2436; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 2437; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2438; GFX9-DL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 2439; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2440; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff 2441; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2442; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 2443; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 2444; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2445; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] 2446; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 2447; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2448; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 2449; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 2450; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 20, 4 2451; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 2452; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 2453; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 8, 4 2454; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 4, 4 2455; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 2456; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2457; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2 2458; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 2459; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 2460; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 2461; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4 2462; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 8, 4 2463; GFX9-DL-NEXT: v_bfe_u32 v18, v2, 4, 4 2464; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 2465; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 2466; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 2467; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1 2468; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2 2469; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 2470; GFX9-DL-NEXT: v_and_b32_e32 v6, v4, v6 2471; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17 2472; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] 2473; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] 2474; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 2475; GFX9-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 2476; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17 2477; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 2478; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 2479; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 2480; 
GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15 2481; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v13 2482; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] 2483; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] 2484; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2485; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 2486; GFX9-DL-NEXT: v_lshl_or_b32 v8, v14, 16, v15 2487; GFX9-DL-NEXT: v_lshl_or_b32 v4, v12, 16, v4 2488; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] 2489; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] 2490; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2491; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3 2492; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] 2493; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] 2494; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] 2495; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] 2496; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v6 2497; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2498; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] 2499; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] 2500; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] 2501; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] 2502; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 2503; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 2504; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v8 2505; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2506; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 2507; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2508; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 2509; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2510; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] 2511; GFX9-DL-NEXT: s_endpgm 2512; 2513; GFX10-DL-XNACK-LABEL: 
idot8_acc16_vecMul: 2514; GFX10-DL-XNACK: ; %bb.0: ; %entry 2515; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2516; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2517; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2518; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff 2519; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2520; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2521; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 2522; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 2523; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 2524; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 2525; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 2526; GFX10-DL-XNACK-NEXT: s_clause 0x1 2527; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] 2528; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] 2529; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 2530; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] 2531; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 2532; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1 2533; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 24, 4 2534; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 20, 4 2535; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 16, 4 2536; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 12, 4 2537; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 8, 4 2538; GFX10-DL-XNACK-NEXT: v_bfe_u32 v11, v1, 4, 4 2539; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 15, v1 2540; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 2541; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2 2542; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4 2543; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v2 2544; GFX10-DL-XNACK-NEXT: v_bfe_u32 v14, v2, 24, 4 2545; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1 2546; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 2547; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 20, 4 2548; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 16, 4 2549; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 12, 4 2550; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 8, 4 2551; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 
v1, v11, 16, v1 2552; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 2553; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10 2554; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8 2555; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2 2556; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] 2557; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] 2558; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 2559; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 2560; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v18, 16, v2 2561; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 2562; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] 2563; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v17 2564; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] 2565; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] 2566; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] 2567; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 2568; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 2569; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] 2570; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 2571; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] 2572; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 2573; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 2574; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 2575; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v6 2576; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1] 2577; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v14 2578; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2 2579; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10 2580; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3 2581; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] 2582; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4 2583; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2 2584; GFX10-DL-XNACK-NEXT: v_add_nc_u16 
v1, v1, v2 2585; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1] 2586; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] 2587; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 2588; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v6 2589; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 2590; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] 2591; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2592; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4 2593; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 2594; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5 2595; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 2596; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 2597; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 2598; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] 2599; GFX10-DL-XNACK-NEXT: s_endpgm 2600; 2601; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: 2602; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 2603; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2604; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2605; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2606; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 2607; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff 2608; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2609; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2610; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 2611; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 2612; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 2613; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 2614; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 2615; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 2616; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] 2617; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] 2618; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] 2619; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 2620; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1 2621; GFX10-DL-NOXNACK-NEXT: 
v_bfe_u32 v6, v1, 24, 4 2622; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 20, 4 2623; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 16, 4 2624; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 12, 4 2625; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 8, 4 2626; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v11, v1, 4, 4 2627; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 15, v1 2628; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 2629; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0 2630; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4 2631; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v0 2632; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v14, v0, 24, 4 2633; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1 2634; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 2635; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 20, 4 2636; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 16, 4 2637; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 12, 4 2638; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 8, 4 2639; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 2640; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 2641; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10 2642; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8 2643; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0 2644; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] 2645; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] 2646; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 2647; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 2648; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v18, 16, v0 2649; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 2650; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] 2651; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v17 2652; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] 2653; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] 2654; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] 2655; GFX10-DL-NOXNACK-NEXT: 
v_pk_mul_lo_u16 v1, v1, v8 2656; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 2657; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] 2658; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] 2659; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] 2660; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 2661; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 2662; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3 2663; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v6 2664; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1] 2665; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v14 2666; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0 2667; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10 2668; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3 2669; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] 2670; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4 2671; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 2672; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 2673; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1] 2674; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] 2675; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 2676; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6 2677; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 2678; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] 2679; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2680; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4 2681; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3 2682; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5 2683; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2684; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 2685; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 2686; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] 2687; GFX10-DL-NOXNACK-NEXT: s_endpgm 2688; GFX10-DL-LABEL: idot8_acc16_vecMul: 2689; 
GFX10-DL: ; %bb.0: ; %entry 2690; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 2691; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2692; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2693; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2694; GFX10-DL-NEXT: s_mov_b32 s14, -1 2695; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 2696; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 2697; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2698; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 2699; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2700; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 2701; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 2702; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2703; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2704; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 2705; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 2706; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 2707; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40014 2708; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 2709; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c 2710; GFX10-DL-NEXT: s_and_b32 s10, s0, 15 2711; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 2712; GFX10-DL-NEXT: s_and_b32 s11, s1, 15 2713; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 2714; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40004 2715; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] 2716; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s10 2717; GFX10-DL-NEXT: s_bfe_u32 s11, s1, 0x4000c 2718; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] 2719; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008 2720; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 2721; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s8, s9 2722; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s11 2723; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] 2724; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] 2725; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] 2726; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 2727; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 2728; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 
2729; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] 2730; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] 2731; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 2732; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 2733; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] 2734; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] 2735; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 2736; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 2737; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28 2738; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3 2739; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] 2740; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 2741; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] 2742; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2743; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 2744; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2745; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] 2746; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1] 2747; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 2748; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 2749; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] 2750; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2751; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] 2752; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 2753; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 2754; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2755; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 2756; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2757; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 2758; GFX10-DL-NEXT: s_endpgm 2759 <8 x i4> addrspace(1)* %src2, 2760 i16 
addrspace(1)* nocapture %dst) { 2761entry: 2762 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2763 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx 2764 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1 2765 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx 2766 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2 2767 2768 %cvec1 = sext <8 x i4> %vec1 to <8 x i16> 2769 %cvec2 = sext <8 x i4> %vec2 to <8 x i16> 2770 2771 %mul = mul <8 x i16> %cvec1, %cvec2 2772 %mul0 = extractelement <8 x i16> %mul, i64 0 2773 %mul1 = extractelement <8 x i16> %mul, i64 1 2774 %mul2 = extractelement <8 x i16> %mul, i64 2 2775 %mul3 = extractelement <8 x i16> %mul, i64 3 2776 %mul4 = extractelement <8 x i16> %mul, i64 4 2777 %mul5 = extractelement <8 x i16> %mul, i64 5 2778 %mul6 = extractelement <8 x i16> %mul, i64 6 2779 %mul7 = extractelement <8 x i16> %mul, i64 7 2780 2781 %acc = load i16, i16 addrspace(1)* %dst, align 4 2782 %add1 = add i16 %mul0, %acc 2783 %add2 = add i16 %add1, %mul1 2784 %add3 = add i16 %add2, %mul2 2785 %add4 = add i16 %add3, %mul3 2786 %add5 = add i16 %add4, %mul4 2787 %add6 = add i16 %add5, %mul5 2788 %add7 = add i16 %add6, %mul6 2789 %add8 = add i16 %add7, %mul7 2790 2791 store i16 %add8, i16 addrspace(1)* %dst, align 4 2792 ret void 2793} 2794 2795; TODO: Support this pattern. 
; idot8_acc8_vecMul: signed 4-bit, 8-element dot product with an i8 accumulator,
; where the multiply is expressed as one vector operation.
;
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads one <8 x i4>
; vector from %src1 and one from %src2, sign-extends both to <8 x i8>, and
; multiplies them with a single <8 x i8> mul. The eight products are then
; extracted and added one at a time, starting from the i8 value loaded from
; %dst; the final i8 sum is stored back to %dst.
;
; The CHECK lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand. Only the
; prefixes named on the RUN lines at the top of the file are verified.
define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc8_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT:    s_movk_i32 s4, 0xff
; GFX7-NEXT:    s_mov_b32 s5, 0xffff
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
; GFX7-NEXT:    v_bfe_i32 v4, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v5, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v7, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v9, v2, 4, 4
; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v3
; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_ashrrev_i32_e32 v11, 28, v0
; GFX7-NEXT:    v_bfe_i32 v12, v0, 24, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 12, 4
; GFX7-NEXT:    v_bfe_i32 v16, v0, 8, 4
; GFX7-NEXT:    v_bfe_i32 v17, v0, 4, 4
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v10
; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
; GFX7-NEXT:    v_or_b32_e32 v6, v8, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v11
; GFX7-NEXT:    v_and_b32_e32 v8, s4, v12
; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v13
; GFX7-NEXT:    v_and_b32_e32 v10, s4, v14
; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v15
; GFX7-NEXT:    v_and_b32_e32 v13, s4, v16
; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v17
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
; GFX7-NEXT:    v_or_b32_e32 v7, v8, v7
; GFX7-NEXT:    v_or_b32_e32 v8, v10, v9
; GFX7-NEXT:    v_or_b32_e32 v9, v13, v12
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v14
; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
; GFX7-NEXT:    v_and_b32_e32 v7, s4, v2
; GFX7-NEXT:    v_and_b32_e32 v13, s4, v0
; GFX7-NEXT:    v_and_b32_e32 v6, s5, v8
; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 8
; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v13, v1
; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v9, s4, v4
; GFX7-NEXT:    v_and_b32_e32 v15, s4, v5
; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v12, v0
; GFX7-NEXT:    v_bfe_u32 v10, v4, 8, 8
; GFX7-NEXT:    v_bfe_u32 v16, v5, 8, 8
; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v15, v0
; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v16, v0
; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v11, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc8_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    v_mov_b32_e32 v5, 12
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 28, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v2
; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 12, v10
; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v16
; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v3
; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v6
; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v15
; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v18
; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
; GFX8-NEXT:    v_ashrrev_i16_e32 v19, 12, v2
; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v11
; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT:    v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v3, v8, v4
; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
; GFX8-NEXT:    v_mad_u16 v2, v17, v19, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v2, v6
; GFX8-NEXT:    v_mad_u16 v2, v16, v18, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v2, v10
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc8_vecMul:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_e32 v1, v7, v4
; GFX9-NEXT:    v_add_u16_e32 v1, v1, v2
; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
; GFX9-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
; GFX9-NEXT:    v_add_u16_e32 v0, v0, v5
; GFX9-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc8_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_add_u16_e32 v1, v7, v4
; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v5
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v9
; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v4, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v0, 20, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v9, v3, v10
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v9, v8
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v2
; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT:    global_store_byte v4, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v4, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v16
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v7, v14
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v12, v5, v11
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v9
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v9, v8
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v2
; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v11, v0
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                             <8 x i4> addrspace(1)* %src2,
                                             i8 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item element index into both source arrays.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Sign-extend every 4-bit lane to 8 bits before multiplying.
  %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
  %cvec2 = sext <8 x i4> %vec2 to <8 x i8>

  ; One vector multiply, then pull out the eight i8 products.
  %mul = mul <8 x i8> %cvec1, %cvec2
  %mul0 = extractelement <8 x i8> %mul, i64 0
  %mul1 = extractelement <8 x i8> %mul, i64 1
  %mul2 = extractelement <8 x i8> %mul, i64 2
  %mul3 = extractelement <8 x i8> %mul, i64 3
  %mul4 = extractelement <8 x i8> %mul, i64 4
  %mul5 = extractelement <8 x i8> %mul, i64 5
  %mul6 = extractelement <8 x i8> %mul, i64 6
  %mul7 = extractelement <8 x i8> %mul, i64 7

  ; Sequential i8 accumulation, seeded with the value currently in %dst.
  %acc = load i8, i8 addrspace(1)* %dst, align 4
  %add1 = add i8 %mul0, %acc
  %add2 = add i8 %add1, %mul1
  %add3 = add i8 %add2, %mul2
  %add4 = add i8 %add3, %mul3
  %add5 = add i8 %add4, %mul4
  %add6 = add i8 %add5, %mul5
  %add7 = add i8 %add6, %mul6
  %add8 = add i8 %add7, %mul7

  store i8 %add8, i8 addrspace(1)* %dst, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()