; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s

; Signed 8 x 4-bit dot product with a 32-bit accumulator.
;
; Each work-item uses its workitem id to index %src1 and %src2, loads one
; <8 x i4> from each, sign-extends every lane to i32, multiplies the lanes
; pairwise and adds the eight products to the i32 value loaded from %dst.
; The sum is stored back to %dst.
;
; What the assertions below pin down per target
;   gfx700, gfx803         - per-lane v_bfe_i32 extraction feeding a chain of
;                            v_mad_i32_i24
;   gfx900                 - per-lane v_bfe_i32, v_mul_i32_i24 and v_add3_u32
;   gfx906, gfx101x/gfx103x - a single v_dot8_i32_i4
define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v2, v3, s0, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v11, v12
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v13, v14
; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v15, v16
; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v8
; GFX9-NEXT:    v_add3_u32 v1, v2, v9, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc32:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item element addresses and the two packed operand loads.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Lanes 0-7: extract, sign-extend i4 -> i32, multiply pairwise.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Fold the eight products into the accumulator read from %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Once the unnecessary zero extensions of the elements are removed,
; the pattern recognizer will kick in.
325define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, 326; GFX7-LABEL: idot8_acc16: 327; GFX7: ; %bb.0: ; %entry 328; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 329; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 330; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 331; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 332; GFX7-NEXT: s_mov_b32 s14, -1 333; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 334; GFX7-NEXT: s_add_u32 s12, s12, s3 335; GFX7-NEXT: s_mov_b32 s3, 0xf000 336; GFX7-NEXT: s_mov_b32 s10, 0 337; GFX7-NEXT: s_mov_b32 s11, s3 338; GFX7-NEXT: s_waitcnt lgkmcnt(0) 339; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 340; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 341; GFX7-NEXT: v_mov_b32_e32 v1, 0 342; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 343; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 344; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 345; GFX7-NEXT: s_mov_b32 s2, -1 346; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 347; GFX7-NEXT: s_addc_u32 s13, s13, 0 348; GFX7-NEXT: s_waitcnt vmcnt(2) 349; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 350; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 351; GFX7-NEXT: s_waitcnt vmcnt(1) 352; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 353; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 354; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 355; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 356; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 357; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 358; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 359; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 360; GFX7-NEXT: s_waitcnt vmcnt(0) 361; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 362; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 363; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 364; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 365; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 366; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 367; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 368; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 369; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 370; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 371; GFX7-NEXT: v_mad_u32_u24 
v1, v5, v12, v1 372; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 373; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 374; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 375; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 376; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 377; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 378; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 379; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 380; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 381; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 382; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 383; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 384; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 385; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 386; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 387; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 388; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 389; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 390; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 391; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 392; GFX7-NEXT: s_endpgm 393; 394; GFX8-LABEL: idot8_acc16: 395; GFX8: ; %bb.0: ; %entry 396; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 397; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 398; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 399; GFX8-NEXT: v_mov_b32_e32 v5, 12 400; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 402; GFX8-NEXT: v_mov_b32_e32 v1, s5 403; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 404; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 405; GFX8-NEXT: flat_load_dword v3, v[0:1] 406; GFX8-NEXT: v_mov_b32_e32 v1, s7 407; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 408; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 409; GFX8-NEXT: flat_load_dword v2, v[0:1] 410; GFX8-NEXT: v_mov_b32_e32 v0, s0 411; GFX8-NEXT: v_mov_b32_e32 v1, s1 412; GFX8-NEXT: flat_load_ushort v4, v[0:1] 413; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 414; GFX8-NEXT: s_mov_b32 s10, -1 415; GFX8-NEXT: s_mov_b32 s11, 0xe80000 416; GFX8-NEXT: s_add_u32 s8, s8, s3 417; GFX8-NEXT: s_addc_u32 s9, s9, 0 418; GFX8-NEXT: s_waitcnt vmcnt(2) 419; GFX8-NEXT: 
v_lshrrev_b32_e32 v10, 4, v3 420; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 421; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 422; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 423; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 424; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 425; GFX8-NEXT: s_waitcnt vmcnt(1) 426; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 427; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 428; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 429; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 430; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 431; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 432; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 433; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 434; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 435; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 436; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 437; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 438; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 439; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 440; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 441; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 442; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 443; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 444; GFX8-NEXT: s_waitcnt vmcnt(0) 445; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4 446; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 447; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 448; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 449; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 450; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4 451; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 452; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 453; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 454; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 455; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 456; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 457; GFX8-NEXT: 
v_lshlrev_b16_e32 v12, 12, v12 458; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 459; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 460; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 461; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 462; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 463; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 464; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 465; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 466; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 467; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 468; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 469; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 470; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 471; GFX8-NEXT: flat_store_short v[0:1], v2 472; GFX8-NEXT: s_endpgm 473; 474; GFX9-LABEL: idot8_acc16: 475; GFX9: ; %bb.0: ; %entry 476; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 477; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 478; GFX9-NEXT: s_mov_b32 s10, -1 479; GFX9-NEXT: s_mov_b32 s11, 0xe00000 480; GFX9-NEXT: s_add_u32 s8, s8, s3 481; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 482; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 483; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 484; GFX9-NEXT: v_mov_b32_e32 v4, 12 485; GFX9-NEXT: s_waitcnt lgkmcnt(0) 486; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 487; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 488; GFX9-NEXT: v_mov_b32_e32 v0, 0 489; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] 490; GFX9-NEXT: s_addc_u32 s9, s9, 0 491; GFX9-NEXT: s_waitcnt vmcnt(2) 492; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 493; GFX9-NEXT: s_waitcnt vmcnt(1) 494; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 495; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 496; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2 497; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 498; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 499; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 500; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 501; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 502; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 503; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 504; GFX9-NEXT: v_lshrrev_b32_e32 
v13, 8, v2 505; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 506; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 507; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 508; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 509; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 510; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 511; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 512; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 513; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 514; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 515; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 516; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 517; GFX9-NEXT: s_waitcnt vmcnt(0) 518; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 519; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 520; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 521; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 522; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 523; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 524; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 525; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 526; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 527; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 528; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 529; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 530; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 531; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 532; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 533; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 534; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 535; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 536; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 537; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 538; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 539; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 540; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 541; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 542; 
GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 543; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 544; GFX9-NEXT: global_store_short v0, v1, s[2:3] 545; GFX9-NEXT: s_endpgm 546; 547; GFX9-DL-LABEL: idot8_acc16: 548; GFX9-DL: ; %bb.0: ; %entry 549; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 550; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 551; GFX9-DL-NEXT: s_mov_b32 s10, -1 552; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 553; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 554; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 555; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 556; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 557; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 558; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 559; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 560; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 561; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 562; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] 563; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 564; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 565; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 566; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 567; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 568; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 569; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 570; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 571; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 572; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 573; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 574; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 575; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 576; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 577; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 578; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 579; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 580; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 581; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 582; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 583; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 584; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 585; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 586; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 587; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 588; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 589; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 590; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 591; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 592; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 593; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 594; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 595; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 596; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 597; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 598; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 599; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 600; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 601; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 602; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 603; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 604; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 605; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 606; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 607; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 608; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 609; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 610; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 611; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 612; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 613; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 614; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 615; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 616; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 617; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] 618; GFX9-DL-NEXT: s_endpgm 619; 620; GFX10-DL-XNACK-LABEL: idot8_acc16: 621; GFX10-DL-XNACK: ; %bb.0: ; %entry 622; GFX10-DL-XNACK-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 623; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 624; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 625; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 626; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 627; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 628; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 629; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 630; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 631; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 632; GFX10-DL-XNACK-NEXT: s_clause 0x1 633; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] 634; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] 635; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 636; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] 637; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 638; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 639; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 640; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 641; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 642; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 643; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 644; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 645; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 646; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 647; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 648; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 649; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 650; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 651; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 652; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 653; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 654; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 655; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 656; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 657; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 658; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 659; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 
660; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 661; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 662; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 663; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 664; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 665; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 666; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 667; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 668; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 669; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 670; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 671; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 672; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 673; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 674; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 675; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 676; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 677; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 678; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 679; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 680; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 681; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 682; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 683; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 684; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 685; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 686; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 687; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 688; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 689; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 690; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 691; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 692; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 693; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 694; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] 695; GFX10-DL-XNACK-NEXT: s_endpgm 696; 697; GFX10-DL-NOXNACK-LABEL: idot8_acc16: 698; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 699; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 700; GFX10-DL-NOXNACK-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 701; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 702; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 703; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 704; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 705; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 706; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 707; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 708; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 709; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 710; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 711; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 712; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] 713; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] 714; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] 715; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 716; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 717; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 718; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 719; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 720; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 721; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 722; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 723; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 724; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 725; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 726; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 727; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 728; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 729; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 730; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 731; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 732; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 733; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 734; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 735; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 736; GFX10-DL-NOXNACK-NEXT: 
v_ashrrev_i16 v17, 12, v17 737; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 738; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 739; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 740; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 741; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 742; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 743; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 744; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 745; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 746; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 747; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 748; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 749; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 750; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 751; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 752; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 753; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 754; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 755; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 756; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 757; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 758; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 759; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 760; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 761; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 762; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 763; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 764; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 765; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 766; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 767; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 768; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 769; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 770; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 771; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 772; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] 773; GFX10-DL-NOXNACK-NEXT: s_endpgm 774; 
GFX10-DL-LABEL: idot8_acc16: 775; GFX10-DL: ; %bb.0: ; %entry 776; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 777; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 778; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 779; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 780; GFX10-DL-NEXT: s_mov_b32 s14, -1 781; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 782; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 783; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 784; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 785; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 786; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 787; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 788; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 789; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 790; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 791; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 792; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 793; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 794; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2 795; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3 796; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 797; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 798; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 799; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 800; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10 801; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 802; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 803; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 804; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 805; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 806; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1 807; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff 808; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 809; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 810; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 811; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 812; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1 813; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 814; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 815; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 
0x40014 816; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 817; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 818; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 819; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 820; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 821; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 822; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 823; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 824; GFX10-DL-NEXT: s_endpgm 825 <8 x i4> addrspace(1)* %src2, 826 i16 addrspace(1)* nocapture %dst) { 827entry: 828 %idx = call i32 @llvm.amdgcn.workitem.id.x() 829 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx 830 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1 831 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx 832 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2 833 834 %v1e0 = extractelement <8 x i4> %vec1, i64 0 835 %cv1e0 = sext i4 %v1e0 to i16 836 %v2e0 = extractelement <8 x i4> %vec2, i64 0 837 %cv2e0 = sext i4 %v2e0 to i16 838 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0 839 840 %v1e1 = extractelement <8 x i4> %vec1, i64 1 841 %cv1e1 = sext i4 %v1e1 to i16 842 %v2e1 = extractelement <8 x i4> %vec2, i64 1 843 %cv2e1 = sext i4 %v2e1 to i16 844 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1 845 846 %v1e2 = extractelement <8 x i4> %vec1, i64 2 847 %cv1e2 = sext i4 %v1e2 to i16 848 %v2e2 = extractelement <8 x i4> %vec2, i64 2 849 %cv2e2 = sext i4 %v2e2 to i16 850 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2 851 852 %v1e3 = extractelement <8 x i4> %vec1, i64 3 853 %cv1e3 = sext i4 %v1e3 to i16 854 %v2e3 = extractelement <8 x i4> %vec2, i64 3 855 %cv2e3 = sext i4 %v2e3 to i16 856 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3 857 858 %v1e4 = extractelement <8 x i4> %vec1, i64 4 859 %cv1e4 = sext i4 %v1e4 to i16 860 %v2e4 = extractelement <8 x i4> %vec2, i64 4 861 %cv2e4 = sext i4 %v2e4 to i16 862 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4 863 864 %v1e5 = extractelement <8 x i4> %vec1, i64 5 865 %cv1e5 = sext i4 %v1e5 to i16 866 %v2e5 = extractelement <8 x i4> 
%vec2, i64 5 867 %cv2e5 = sext i4 %v2e5 to i16 868 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5 869 870 %v1e6 = extractelement <8 x i4> %vec1, i64 6 871 %cv1e6 = sext i4 %v1e6 to i16 872 %v2e6 = extractelement <8 x i4> %vec2, i64 6 873 %cv2e6 = sext i4 %v2e6 to i16 874 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6 875 876 %v1e7 = extractelement <8 x i4> %vec1, i64 7 877 %cv1e7 = sext i4 %v1e7 to i16 878 %v2e7 = extractelement <8 x i4> %vec2, i64 7 879 %cv2e7 = sext i4 %v2e7 to i16 880 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7 881 882 %acc = load i16, i16 addrspace(1)* %dst, align 4 883 %add1 = add i16 %mul0, %acc 884 %add2 = add i16 %add1, %mul1 885 %add3 = add i16 %add2, %mul2 886 %add4 = add i16 %add3, %mul3 887 %add5 = add i16 %add4, %mul4 888 %add6 = add i16 %add5, %mul5 889 %add7 = add i16 %add6, %mul6 890 %add8 = add i16 %add7, %mul7 891 892 store i16 %add8, i16 addrspace(1)* %dst, align 4 893 ret void 894} 895 896; TODO: Support this pattern. 897define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, 898; GFX7-LABEL: idot8_acc8: 899; GFX7: ; %bb.0: ; %entry 900; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 901; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 902; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 903; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 904; GFX7-NEXT: s_mov_b32 s14, -1 905; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 906; GFX7-NEXT: s_add_u32 s12, s12, s3 907; GFX7-NEXT: s_mov_b32 s3, 0xf000 908; GFX7-NEXT: s_mov_b32 s10, 0 909; GFX7-NEXT: s_mov_b32 s11, s3 910; GFX7-NEXT: s_waitcnt lgkmcnt(0) 911; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 912; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 913; GFX7-NEXT: v_mov_b32_e32 v1, 0 914; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 915; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 916; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 917; GFX7-NEXT: s_mov_b32 s2, -1 918; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 919; GFX7-NEXT: s_addc_u32 s13, s13, 0 920; GFX7-NEXT: s_waitcnt vmcnt(2) 921; 
GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 922; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 923; GFX7-NEXT: s_waitcnt vmcnt(1) 924; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 925; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 926; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 927; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 928; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 929; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 930; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 931; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 932; GFX7-NEXT: s_waitcnt vmcnt(0) 933; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 934; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 935; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 936; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 937; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 938; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 939; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 940; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 941; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 942; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 943; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 944; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 945; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 946; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 947; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 948; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 949; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 950; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 951; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 952; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 953; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 954; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 955; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 956; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 957; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 958; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 959; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 960; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 961; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 962; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 963; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 964; GFX7-NEXT: s_endpgm 965; 966; GFX8-LABEL: idot8_acc8: 967; GFX8: ; %bb.0: ; %entry 968; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 969; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 970; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 971; GFX8-NEXT: v_mov_b32_e32 v5, 12 972; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 973; GFX8-NEXT: s_waitcnt lgkmcnt(0) 974; GFX8-NEXT: v_mov_b32_e32 v1, s5 975; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 976; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 977; GFX8-NEXT: flat_load_dword v3, v[0:1] 978; GFX8-NEXT: v_mov_b32_e32 v1, s7 979; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 980; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 981; GFX8-NEXT: flat_load_dword v2, v[0:1] 982; GFX8-NEXT: v_mov_b32_e32 v0, s0 983; GFX8-NEXT: v_mov_b32_e32 v1, s1 984; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 985; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 986; GFX8-NEXT: s_mov_b32 s10, -1 987; GFX8-NEXT: s_mov_b32 s11, 0xe80000 988; GFX8-NEXT: s_add_u32 s8, s8, s3 989; GFX8-NEXT: s_addc_u32 s9, s9, 0 990; GFX8-NEXT: s_waitcnt vmcnt(2) 991; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 992; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 993; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 994; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 995; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 996; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 997; GFX8-NEXT: s_waitcnt vmcnt(1) 998; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 999; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 1000; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1001; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 1002; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 1003; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 1004; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1005; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1006; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1007; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1008; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 
1009; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1010; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1011; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 1012; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1013; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1014; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1015; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 1016; GFX8-NEXT: s_waitcnt vmcnt(0) 1017; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4 1018; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1019; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1020; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1021; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1022; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4 1023; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1024; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1025; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 1026; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1027; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 1028; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1029; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1030; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 1031; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1032; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1033; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 1034; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 1035; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1036; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1037; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1038; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 1039; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1040; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1041; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1042; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 1043; GFX8-NEXT: flat_store_byte v[0:1], v2 1044; GFX8-NEXT: s_endpgm 1045; 1046; GFX9-LABEL: idot8_acc8: 1047; GFX9: ; %bb.0: ; %entry 1048; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1049; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1050; GFX9-NEXT: s_mov_b32 s10, -1 1051; GFX9-NEXT: s_mov_b32 s11, 0xe00000 1052; GFX9-NEXT: s_add_u32 s8, s8, s3 1053; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1054; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 1055; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1056; GFX9-NEXT: v_mov_b32_e32 v4, 12 1057; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1058; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 1059; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 1060; GFX9-NEXT: v_mov_b32_e32 v0, 0 1061; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] 1062; GFX9-NEXT: s_addc_u32 s9, s9, 0 1063; GFX9-NEXT: s_waitcnt vmcnt(2) 1064; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 1065; GFX9-NEXT: s_waitcnt vmcnt(1) 1066; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 1067; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 1068; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2 1069; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 1070; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1071; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 1072; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 1073; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 1074; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 1075; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 1076; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 1077; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1078; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1079; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1080; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1081; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 1082; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 1083; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1084; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1085; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1086; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1087; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1088; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1089; GFX9-NEXT: s_waitcnt vmcnt(0) 1090; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 1091; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1092; GFX9-NEXT: v_lshlrev_b16_e32 
v12, 12, v12 1093; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1094; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1095; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 1096; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1097; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1098; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 1099; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1100; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1101; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1102; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1103; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 1104; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1105; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1106; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 1107; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 1108; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1109; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 1110; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1111; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 1112; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 1113; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1114; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1115; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 1116; GFX9-NEXT: global_store_byte v0, v1, s[2:3] 1117; GFX9-NEXT: s_endpgm 1118; 1119; GFX9-DL-LABEL: idot8_acc8: 1120; GFX9-DL: ; %bb.0: ; %entry 1121; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1122; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1123; GFX9-DL-NEXT: s_mov_b32 s10, -1 1124; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 1125; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 1126; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1127; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1128; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1129; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 1130; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1131; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1132; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1133; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1134; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] 1135; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 1136; GFX9-DL-NEXT: 
s_waitcnt vmcnt(2) 1137; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 1138; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1139; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 1140; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 1141; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 1142; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 1143; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1144; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 1145; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 1146; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 1147; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 1148; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 1149; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 1150; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1151; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1152; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1153; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1154; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 1155; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 1156; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1157; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1158; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1159; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1160; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1161; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1162; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1163; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 1164; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1165; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1166; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1167; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1168; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 1169; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1170; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1171; GFX9-DL-NEXT: 
v_mad_legacy_u16 v3, v8, v13, v3 1172; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1173; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1174; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1175; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1176; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 1177; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1178; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1179; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 1180; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 1181; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1182; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 1183; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1184; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 1185; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 1186; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1187; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1188; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 1189; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] 1190; GFX9-DL-NEXT: s_endpgm 1191; 1192; GFX10-DL-XNACK-LABEL: idot8_acc8: 1193; GFX10-DL-XNACK: ; %bb.0: ; %entry 1194; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1195; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1196; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1197; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1198; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1199; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 1200; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 1201; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 1202; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 1203; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 1204; GFX10-DL-XNACK-NEXT: s_clause 0x1 1205; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] 1206; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] 1207; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 1208; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1] 1209; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 1210; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1211; 
GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1212; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1213; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1214; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 1215; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 1216; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 1217; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 1218; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 1219; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 1220; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 1221; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1222; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1223; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 1224; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 1225; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 1226; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 1227; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 1228; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 1229; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 1230; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 1231; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 1232; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 1233; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 1234; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 1235; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 1236; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 1237; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 1238; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 1239; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 1240; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 1241; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 1242; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 1243; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 1244; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 1245; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 1246; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 1247; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 1248; 
GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 1249; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 1250; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 1251; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 1252; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 1253; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 1254; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 1255; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 1256; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 1257; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 1258; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 1259; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 1260; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 1261; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 1262; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 1263; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 1264; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 1265; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 1266; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1] 1267; GFX10-DL-XNACK-NEXT: s_endpgm 1268; 1269; GFX10-DL-NOXNACK-LABEL: idot8_acc8: 1270; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 1271; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1272; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1273; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1274; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1275; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 1276; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1277; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1278; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 1279; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 1280; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 1281; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 1282; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 1283; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1284; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] 1285; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] 1286; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, 
s[0:1] 1287; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 1288; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1289; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1290; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1291; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1292; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 1293; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 1294; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 1295; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 1296; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 1297; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 1298; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 1299; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 1300; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 1301; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 1302; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 1303; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 1304; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 1305; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 1306; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 1307; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 1308; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 1309; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 1310; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 1311; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 1312; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 1313; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 1314; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 1315; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 1316; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 1317; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 1318; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 1319; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 1320; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 1321; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 1322; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 
v8, 12, v8 1323; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 1324; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 1325; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 1326; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 1327; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 1328; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 1329; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 1330; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 1331; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 1332; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 1333; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 1334; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 1335; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 1336; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 1337; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 1338; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 1339; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 1340; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 1341; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 1342; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 1343; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 1344; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1] 1345; GFX10-DL-NOXNACK-NEXT: s_endpgm
1397 <8 x i4> addrspace(1)* %src2, 1398 i8 addrspace(1)* nocapture %dst) { 1399entry: 1400 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1401 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx 1402 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1 1403 %gep2 = 
getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx 1404 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2 1405 1406 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1407 %cv1e0 = sext i4 %v1e0 to i8 1408 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1409 %cv2e0 = sext i4 %v2e0 to i8 1410 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 1411 1412 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1413 %cv1e1 = sext i4 %v1e1 to i8 1414 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1415 %cv2e1 = sext i4 %v2e1 to i8 1416 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 1417 1418 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1419 %cv1e2 = sext i4 %v1e2 to i8 1420 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1421 %cv2e2 = sext i4 %v2e2 to i8 1422 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 1423 1424 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1425 %cv1e3 = sext i4 %v1e3 to i8 1426 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1427 %cv2e3 = sext i4 %v2e3 to i8 1428 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 1429 1430 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1431 %cv1e4 = sext i4 %v1e4 to i8 1432 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1433 %cv2e4 = sext i4 %v2e4 to i8 1434 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 1435 1436 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1437 %cv1e5 = sext i4 %v1e5 to i8 1438 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1439 %cv2e5 = sext i4 %v2e5 to i8 1440 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 1441 1442 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1443 %cv1e6 = sext i4 %v1e6 to i8 1444 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1445 %cv2e6 = sext i4 %v2e6 to i8 1446 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 1447 1448 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1449 %cv1e7 = sext i4 %v1e7 to i8 1450 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1451 %cv2e7 = sext i4 %v2e7 to i8 1452 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 1453 1454 %acc = load i8, i8 addrspace(1)* %dst, align 4 1455 %add1 = add i8 %mul0, %acc 1456 %add2 = add i8 %add1, %mul1 1457 %add3 = add i8 %add2, 
%mul2
  %add4 = add i8 %add3, %mul3
  %add5 = add i8 %add4, %mul4
  %add6 = add i8 %add5, %mul5
  %add7 = add i8 %add6, %mul6
  %add8 = add i8 %add7, %mul7

  store i8 %add8, i8 addrspace(1)* %dst, align 4
  ret void
}

; Make sure the pattern is not recognized if there are multiple uses of the
; intermediate multiplications.
; Here %mul0 feeds both %add and %add1, and %add is used again by %res, so
; every prefix below expects a mad/mul+add chain and no v_dot8_i32_i4.
define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_multiuses_mul1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v16, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, v16
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v16, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v16, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_multiuses_mul1:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_multiuses_mul1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-DL-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-DL-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-DL-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v4, v2, 4, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v2, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v2, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v5, v0, v7, s2
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v12, v2, 20, 4
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v0, v0, v7, v5
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v13, v2, 24, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v8, v6
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v1, v5
; GFX10-DL-XNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v4, v0, 4, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v0, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v0, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v5, v2, v7, s2
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v12, v0, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v13, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v8, v6
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                                <8 x i4> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item loads its own <8 x i4> element from %src1 and %src2.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Sign-extend every i4 lane to i32 and multiply the lanes pairwise.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; %mul0 is added twice (%add, then %add1), giving the multiply two uses.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul0, %acc
  %add1 = add i32 %mul0, %add
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  ; The partial sum %add is reused on top of the full reduction.
  %res = add i32 %add, %add8
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}

; Whole-vector sext + mul form of the i32-accumulator dot product. The DL
; targets below already select v_dot8_i32_i4 for it; the remaining targets
; expand it to mad or mul+add3 chains.
define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v2
; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v8, v2, 4, 4
; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 4, 4
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v10, v0
; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v9, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 28, v3
; GFX8-NEXT:    v_bfe_i32 v2, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 20, 4
; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v7, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
; GFX8-NEXT:    v_bfe_i32 v10, v0, 24, 4
; GFX8-NEXT:    v_bfe_i32 v11, v0, 20, 4
; GFX8-NEXT:    v_bfe_i32 v12, v0, 16, 4
; GFX8-NEXT:    v_bfe_i32 v13, v0, 12, 4
; GFX8-NEXT:    v_bfe_i32 v14, v0, 8, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 4, 4
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
; GFX8-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
; GFX8-NEXT:    v_mad_i32_i24 v0, v2, v10, v0
; GFX8-NEXT:    v_mad_i32_i24 v2, v1, v9, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc32_vecMul:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 28, v1
; GFX9-NEXT:    v_bfe_i32 v4, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v8, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v15, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 4
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v9, v16
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v8, v15
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v7, v14
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v6, v13
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v12
; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v4, v11
; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v10
; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item loads its own <8 x i4> element from %src1 and %src2.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Widen both operands as whole vectors rather than lane by lane.
  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>

  ; A single vector multiply produces all eight products; each is then
  ; extracted for the scalar reduction.
  %mul = mul <8 x i32> %cvec1, %cvec2
  %mul0 = extractelement <8 x i32> %mul, i64 0
  %mul1 = extractelement <8 x i32> %mul, i64 1
  %mul2 = extractelement <8 x i32> %mul, i64 2
  %mul3 = extractelement <8 x i32> %mul, i64 3
  %mul4 = extractelement <8 x i32> %mul, i64 4
  %mul5 = extractelement <8 x i32> %mul, i64 5
  %mul6 = extractelement <8 x i32> %mul, i64 6
  %mul7 = extractelement <8 x i32> %mul, i64 7

  ; Accumulate the products onto the i32 value currently stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern.
define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc16_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_bfe_i32 v3, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v5, v2, 4, 4
; GFX7-NEXT:    v_bfe_i32 v6, v2, 0, 4
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 4, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v11
; GFX7-NEXT:
v_lshlrev_b32_e32 v10, 16, v12 2225; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v13 2226; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 2227; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 2228; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 2229; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 2230; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v14 2231; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v16 2232; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 2233; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 2234; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 2235; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 2236; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 2237; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 2238; GFX7-NEXT: s_waitcnt vmcnt(0) 2239; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 2240; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4 2241; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 2242; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 2243; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 2244; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 2245; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v15 2246; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1 2247; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 2248; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 2249; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 2250; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 2251; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 2252; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 2253; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 2254; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2255; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0 2256; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 2257; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 2258; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 2259; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 2260; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 2261; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 2262; GFX7-NEXT: s_endpgm 2263; 2264; GFX8-LABEL: idot8_acc16_vecMul: 2265; GFX8: ; %bb.0: ; %entry 2266; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2267; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2268; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2269; GFX8-NEXT: v_mov_b32_e32 v5, 
12 2270; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2271; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2272; GFX8-NEXT: v_mov_b32_e32 v1, s5 2273; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2274; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2275; GFX8-NEXT: flat_load_dword v3, v[0:1] 2276; GFX8-NEXT: v_mov_b32_e32 v1, s7 2277; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 2278; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2279; GFX8-NEXT: flat_load_dword v2, v[0:1] 2280; GFX8-NEXT: v_mov_b32_e32 v0, s0 2281; GFX8-NEXT: v_mov_b32_e32 v1, s1 2282; GFX8-NEXT: flat_load_ushort v4, v[0:1] 2283; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2284; GFX8-NEXT: s_mov_b32 s10, -1 2285; GFX8-NEXT: s_mov_b32 s11, 0xe80000 2286; GFX8-NEXT: s_add_u32 s8, s8, s3 2287; GFX8-NEXT: s_addc_u32 s9, s9, 0 2288; GFX8-NEXT: s_waitcnt vmcnt(2) 2289; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 2290; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2291; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 2292; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2293; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 2294; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 2295; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 2296; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 2297; GFX8-NEXT: s_waitcnt vmcnt(1) 2298; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 2299; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2300; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 2301; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2302; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 2303; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 2304; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 2305; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 2306; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2307; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 2308; GFX8-NEXT: v_lshlrev_b16_e32 v18, 
12, v18 2309; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2310; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 2311; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17 2312; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2313; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 2314; GFX8-NEXT: s_waitcnt vmcnt(0) 2315; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2316; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 2317; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16 2318; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2319; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2320; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 2321; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2322; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2323; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 2324; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2325; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2326; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 2327; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2328; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 2329; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2330; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2331; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 2332; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 2333; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2334; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2335; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2336; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 2337; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2338; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2339; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 2340; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 2341; GFX8-NEXT: flat_store_short v[0:1], v2 2342; GFX8-NEXT: s_endpgm 2343; 2344; GFX9-LABEL: idot8_acc16_vecMul: 2345; GFX9: ; %bb.0: ; %entry 2346; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2347; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2348; GFX9-NEXT: s_mov_b32 s10, -1 2349; GFX9-NEXT: s_mov_b32 s11, 0xe00000 2350; GFX9-NEXT: s_add_u32 s8, s8, s3 2351; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2352; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2353; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2354; 
GFX9-NEXT: v_mov_b32_e32 v4, 12 2355; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2356; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 2357; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 2358; GFX9-NEXT: v_mov_b32_e32 v0, 0 2359; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] 2360; GFX9-NEXT: s_addc_u32 s9, s9, 0 2361; GFX9-NEXT: s_waitcnt vmcnt(2) 2362; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1 2363; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v1 2364; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 2365; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 2366; GFX9-NEXT: v_lshrrev_b32_e32 v9, 20, v1 2367; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2368; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 2369; GFX9-NEXT: s_waitcnt vmcnt(1) 2370; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2 2371; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2372; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v2 2373; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 2374; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 2375; GFX9-NEXT: v_lshrrev_b32_e32 v16, 20, v2 2376; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2377; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 2378; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2379; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v5 2380; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v6 2381; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v7 2382; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v8 2383; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v9 2384; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v10 2385; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v11 2386; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v12 2387; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v13 2388; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2389; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2390; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2391; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2392; 
GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v14 2393; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v15 2394; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 2395; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 2396; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 2397; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2398; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2399; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2400; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 2401; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v16 2402; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 2403; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v18 2404; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 2405; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 2406; GFX9-NEXT: s_waitcnt vmcnt(0) 2407; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 2408; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 2409; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2410; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2411; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2412; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2413; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 2414; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2415; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 2416; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 2417; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 2418; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 2419; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2420; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 2421; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2422; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 2423; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2424; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 2425; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2426; GFX9-NEXT: global_store_short v0, v1, s[2:3] 2427; GFX9-NEXT: s_endpgm 2428; 2429; GFX9-DL-LABEL: idot8_acc16_vecMul: 2430; GFX9-DL: ; %bb.0: ; %entry 2431; GFX9-DL-NEXT: s_mov_b32 
s8, SCRATCH_RSRC_DWORD0 2432; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2433; GFX9-DL-NEXT: s_mov_b32 s10, -1 2434; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 2435; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 2436; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2437; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2438; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2439; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 2440; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2441; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 2442; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 2443; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2444; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] 2445; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 2446; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2447; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 2448; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v1 2449; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 2450; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 2451; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1 2452; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2453; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 2454; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2455; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2 2456; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2457; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v2 2458; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 2459; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2 2460; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v2 2461; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2462; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 2463; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2464; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v5 2465; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v6 2466; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v7 2467; GFX9-DL-NEXT: 
v_lshlrev_b16_e32 v7, 12, v8 2468; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v9 2469; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v10 2470; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v11 2471; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v12 2472; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v13 2473; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2474; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2475; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2476; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2477; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v14 2478; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v15 2479; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 2480; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 2481; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 2482; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2483; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2484; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2485; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 2486; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v16 2487; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 2488; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v18 2489; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 2490; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 2491; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2492; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 2493; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 2494; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2495; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2496; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2497; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2498; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 2499; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2500; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 2501; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 2502; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 2503; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 2504; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2505; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 2506; GFX9-DL-NEXT: 
v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2507; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 2508; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2509; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 2510; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2511; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] 2512; GFX9-DL-NEXT: s_endpgm 2513; 2514; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: 2515; GFX10-DL-XNACK: ; %bb.0: ; %entry 2516; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2517; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2518; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2519; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2520; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2521; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 2522; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 2523; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 2524; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 2525; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 2526; GFX10-DL-XNACK-NEXT: s_clause 0x1 2527; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] 2528; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] 2529; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 2530; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] 2531; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 2532; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 2533; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1 2534; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 2535; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2 2536; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2 2537; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 2538; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v2 2539; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 2540; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 2541; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 2542; 
GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 2543; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 2544; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2 2545; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 2546; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 2547; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 2548; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 2549; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 2550; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 2551; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 2552; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2 2553; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 2554; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 2555; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 2556; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 2557; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 2558; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 2559; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 2560; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2 2561; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 2562; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 2563; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 2564; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13 2565; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 2566; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 2567; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 2568; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 2569; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 2570; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2 2571; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 2572; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 2573; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 2574; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 2575; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16 2576; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 2577; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 2578; GFX10-DL-XNACK-NEXT: 
v_lshrrev_b32_e32 v7, 16, v4 2579; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 2580; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 2581; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 2582; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 2583; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 2584; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 2585; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 2586; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 2587; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 2588; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 2589; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 2590; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 2591; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 2592; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 2593; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 2594; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 2595; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 2596; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5 2597; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 2598; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17 2599; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2 2600; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 2601; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 2602; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 2603; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 2604; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 2605; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2606; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 2607; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2608; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5 2609; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2610; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1 2611; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 2612; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] 2613; GFX10-DL-XNACK-NEXT: s_endpgm 2614; 2615; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: 2616; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 2617; 
GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2618; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2619; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2620; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 2621; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2622; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2623; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 2624; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 2625; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 2626; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 2627; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 2628; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 2629; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] 2630; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] 2631; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] 2632; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 2633; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 2634; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1 2635; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 2636; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0 2637; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0 2638; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 2639; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0 2640; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 2641; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 2642; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 2643; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 2644; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 2645; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0 2646; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 2647; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 2648; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 2649; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 2650; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 2651; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 2652; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 
16, v1 2653; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0 2654; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 2655; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 2656; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 2657; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 2658; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 2659; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 2660; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 2661; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0 2662; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 2663; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 2664; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 2665; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13 2666; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 2667; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 2668; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 2669; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 2670; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 2671; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0 2672; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 2673; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 2674; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 2675; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 2676; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16 2677; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 2678; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 2679; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2680; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 2681; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 2682; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 2683; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 2684; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 2685; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 2686; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 2687; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 2688; 
GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 2689; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 2690; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 2691; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 2692; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 2693; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 2694; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 2695; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 2696; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 2697; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 2698; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 2699; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17 2700; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0 2701; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 2702; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 2703; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 2704; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0 2705; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 2706; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2707; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 2708; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 2709; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5 2710; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2711; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 2712; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 2713; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] 2714; GFX10-DL-NOXNACK-NEXT: s_endpgm
2786 <8 x i4> addrspace(1)* %src2, 2787 i16 addrspace(1)* nocapture %dst) { 2788entry: 2789 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2790 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx 2791 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1 2792 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx 2793 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2 2794 2795 %cvec1 = sext <8 x i4> %vec1 to <8 x i16> 2796 %cvec2 = sext <8 x i4> %vec2 to <8 x i16> 2797 2798 %mul 
= mul <8 x i16> %cvec1, %cvec2
  %mul0 = extractelement <8 x i16> %mul, i64 0
  %mul1 = extractelement <8 x i16> %mul, i64 1
  %mul2 = extractelement <8 x i16> %mul, i64 2
  %mul3 = extractelement <8 x i16> %mul, i64 3
  %mul4 = extractelement <8 x i16> %mul, i64 4
  %mul5 = extractelement <8 x i16> %mul, i64 5
  %mul6 = extractelement <8 x i16> %mul, i64 6
  %mul7 = extractelement <8 x i16> %mul, i64 7

  %acc = load i16, i16 addrspace(1)* %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3
  %add5 = add i16 %add4, %mul4
  %add6 = add i16 %add5, %mul5
  %add7 = add i16 %add6, %mul6
  %add8 = add i16 %add7, %mul7

  store i16 %add8, i16 addrspace(1)* %dst, align 4
  ret void
}

; Signed 4-bit dot product accumulated into an i8, written as one vector
; multiply: both <8 x i4> operands are sign-extended to <8 x i8>, multiplied
; lane-wise, and the eight products are extracted and added one by one to the
; i8 loaded from %dst.
; TODO: Support this pattern. None of the check blocks below contain a dot
; instruction; every target expands the multiplies and adds individually.
define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc8_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
; GFX7-NEXT:    v_bfe_i32 v4, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v5, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v7, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v9, v2, 4, 4
; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_ashrrev_i32_e32 v10, 28, v0
; GFX7-NEXT:    v_bfe_i32 v11, v0, 24, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 20, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 12, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
; GFX7-NEXT:    v_bfe_i32 v16, v0, 4, 4
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX7-NEXT:    v_or_b32_e32 v5, v8, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v10
; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v11
; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v12
; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v13
; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v14
; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v15
; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v16
; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
; GFX7-NEXT:    v_or_b32_e32 v7, v9, v8
; GFX7-NEXT:    v_or_b32_e32 v8, v11, v10
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v12
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v13
; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v2
; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT:    v_bfe_u32 v9, v2, 8, 8
; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v14, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v4
; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v5
; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
; GFX7-NEXT:    v_bfe_u32 v11, v4, 8, 8
; GFX7-NEXT:    v_bfe_u32 v16, v5, 8, 8
; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v15, v0
; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v16, v0
; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
; GFX7-NEXT:    v_bfe_u32 v6, v6, 8, 8
; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc8_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    v_mov_b32_e32 v5, 12
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 28, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v2
; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 12, v10
; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v16
; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v3
; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v6
; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v15
; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v18
; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
; GFX8-NEXT:    v_ashrrev_i16_e32 v19, 12, v2
; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v11
; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT:    v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v3, v8, v4
; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
; GFX8-NEXT:    v_mad_u16 v2, v17, v19, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v2, v6
; GFX8-NEXT:    v_mad_u16 v2, v16, v18, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v2, v10
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc8_vecMul:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_e32 v1, v7, v4
; GFX9-NEXT:    v_add_u16_e32 v1, v1, v2
; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
; GFX9-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
; GFX9-NEXT:    v_add_u16_e32 v0, v0, v5
; GFX9-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc8_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_add_u16_e32 v1, v7, v4
; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v5
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v9
; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v4, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v0, 20, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v9, v3, v10
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v9, v8
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v2
; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT:    global_store_byte v4, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v4, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v16
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v7, v14
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v12, v5, v11
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v9
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v9, v8
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v2
; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v11, v0
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                             <8 x i4> addrspace(1)* %src2,
                                             i8 addrspace(1)* nocapture %dst) {
entry:
  ; Each work item indexes both source arrays with its own x id.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Sign-extend so the 4-bit lanes are multiplied as signed values.
  %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
  %cvec2 = sext <8 x i4> %vec2 to <8 x i8>

  ; A single vector multiply followed by per-lane extraction; this is the
  ; "vecMul" shape, as opposed to eight scalar multiplies.
  %mul = mul <8 x i8> %cvec1, %cvec2
  %mul0 = extractelement <8 x i8> %mul, i64 0
  %mul1 = extractelement <8 x i8> %mul, i64 1
  %mul2 = extractelement <8 x i8> %mul, i64 2
  %mul3 = extractelement <8 x i8> %mul, i64 3
  %mul4 = extractelement <8 x i8> %mul, i64 4
  %mul5 = extractelement <8 x i8> %mul, i64 5
  %mul6 = extractelement <8 x i8> %mul, i64 6
  %mul7 = extractelement <8 x i8> %mul, i64 7

  ; Chain the products onto the accumulator read from %dst. The adds carry no
  ; nsw/nuw flags, so the i8 sum wraps on overflow.
  %acc = load i8, i8 addrspace(1)* %dst, align 4
  %add1 = add i8 %mul0, %acc
  %add2 = add i8 %add1, %mul1
  %add3 = add i8 %add2, %mul2
  %add4 = add i8 %add3, %mul3
  %add5 = add i8 %add4, %mul4
  %add6 = add i8 %add5, %mul5
  %add7 = add i8 %add6, %mul6
  %add8 = add i8 %add7, %mul7

  store i8 %add8, i8 addrspace(1)* %dst, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()