; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s

; Four-lane unsigned byte dot product with a 32-bit accumulator.
; Each work-item loads one <4 x i8> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends every lane to i32, multiplies lane-wise and adds
; the four products to the i32 read from %dst, then writes the sum back to %dst.
; The expectations below show the chain selected as a single v_dot4_u32_u8 on
; the gfx906 and gfx101x runs; the other runs use mad/mul+add sequences.
define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, s4
; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item operand vectors.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %lhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %tid
  %lhs = load <4 x i8>, <4 x i8> addrspace(1)* %lhs.ptr
  %rhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %tid
  %rhs = load <4 x i8>, <4 x i8> addrspace(1)* %rhs.ptr

  ; Lane 0: widen both bytes to i32 and multiply.
  %lhs.b0 = extractelement <4 x i8> %lhs, i64 0
  %lhs.w0 = zext i8 %lhs.b0 to i32
  %rhs.b0 = extractelement <4 x i8> %rhs, i64 0
  %rhs.w0 = zext i8 %rhs.b0 to i32
  %prod0 = mul nuw nsw i32 %lhs.w0, %rhs.w0

  ; Lane 1.
  %lhs.b1 = extractelement <4 x i8> %lhs, i64 1
  %lhs.w1 = zext i8 %lhs.b1 to i32
  %rhs.b1 = extractelement <4 x i8> %rhs, i64 1
  %rhs.w1 = zext i8 %rhs.b1 to i32
  %prod1 = mul nuw nsw i32 %lhs.w1, %rhs.w1

  ; Lane 2.
  %lhs.b2 = extractelement <4 x i8> %lhs, i64 2
  %lhs.w2 = zext i8 %lhs.b2 to i32
  %rhs.b2 = extractelement <4 x i8> %rhs, i64 2
  %rhs.w2 = zext i8 %rhs.b2 to i32
  %prod2 = mul nuw nsw i32 %lhs.w2, %rhs.w2

  ; Lane 3.
  %lhs.b3 = extractelement <4 x i8> %lhs, i64 3
  %lhs.w3 = zext i8 %lhs.b3 to i32
  %rhs.b3 = extractelement <4 x i8> %rhs, i64 3
  %rhs.w3 = zext i8 %rhs.b3 to i32
  %prod3 = mul nuw nsw i32 %lhs.w3, %rhs.w3

  ; Fold the products into the accumulator read from %dst, lane 0 first.
  %acc.in = load i32, i32 addrspace(1)* %dst, align 4
  %sum0 = add i32 %prod0, %acc.in
  %sum1 = add i32 %sum0, %prod1
  %sum2 = add i32 %sum1, %prod2
  %sum3 = add i32 %sum2, %prod3

  store i32 %sum3, i32 addrspace(1)* %dst, align 4
  ret void
}

; Same four-lane pattern as above, but lanes are widened to i16 and the
; accumulator in %dst is an i16. None of the runs below selects a dot
; instruction for this width; all of them emit a chain of 16/24-bit mads.
define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc16:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v2
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xff, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v2
; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item operand vectors.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %lhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %tid
  %lhs = load <4 x i8>, <4 x i8> addrspace(1)* %lhs.ptr
  %rhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %tid
  %rhs = load <4 x i8>, <4 x i8> addrspace(1)* %rhs.ptr

  ; Lane 0: widen both bytes to i16 and multiply.
  %lhs.b0 = extractelement <4 x i8> %lhs, i64 0
  %lhs.h0 = zext i8 %lhs.b0 to i16
  %rhs.b0 = extractelement <4 x i8> %rhs, i64 0
  %rhs.h0 = zext i8 %rhs.b0 to i16
  %prod0 = mul nuw nsw i16 %lhs.h0, %rhs.h0

  ; Lane 1.
  %lhs.b1 = extractelement <4 x i8> %lhs, i64 1
  %lhs.h1 = zext i8 %lhs.b1 to i16
  %rhs.b1 = extractelement <4 x i8> %rhs, i64 1
  %rhs.h1 = zext i8 %rhs.b1 to i16
  %prod1 = mul nuw nsw i16 %lhs.h1, %rhs.h1

  ; Lane 2.
  %lhs.b2 = extractelement <4 x i8> %lhs, i64 2
  %lhs.h2 = zext i8 %lhs.b2 to i16
  %rhs.b2 = extractelement <4 x i8> %rhs, i64 2
  %rhs.h2 = zext i8 %rhs.b2 to i16
  %prod2 = mul nuw nsw i16 %lhs.h2, %rhs.h2

  ; Lane 3.
  %lhs.b3 = extractelement <4 x i8> %lhs, i64 3
  %lhs.h3 = zext i8 %lhs.b3 to i16
  %rhs.b3 = extractelement <4 x i8> %rhs, i64 3
  %rhs.h3 = zext i8 %rhs.b3 to i16
  %prod3 = mul nuw nsw i16 %lhs.h3, %rhs.h3

  ; Fold the products into the i16 accumulator read from %dst.
  %acc.in = load i16, i16 addrspace(1)* %dst, align 2
  %sum0 = add i16 %prod0, %acc.in
  %sum1 = add i16 %sum0, %prod1
  %sum2 = add i16 %sum1, %prod2
  %sum3 = add i16 %sum2, %prod3

  store i16 %sum3, i16 addrspace(1)* %dst, align 2
  ret void
}

; Four-lane variant that stays in i8 throughout: the lanes are multiplied as
; i8 (no widening) and accumulated into the i8 stored at %dst. As with the
; i16 case, the expectations contain mad chains only, on every run.
define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
; GFX10-DL-NEXT:    s_endpgm
                                      <4 x i8> addrspace(1)* %src2,
                                      i8 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item operand vectors.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %lhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %tid
  %lhs = load <4 x i8>, <4 x i8> addrspace(1)* %lhs.ptr
  %rhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %tid
  %rhs = load <4 x i8>, <4 x i8> addrspace(1)* %rhs.ptr

  ; Lane 0 product, computed directly in i8.
  %lhs.b0 = extractelement <4 x i8> %lhs, i64 0
  %rhs.b0 = extractelement <4 x i8> %rhs, i64 0
  %prod0 = mul nuw nsw i8 %lhs.b0, %rhs.b0

  ; Lane 1.
  %lhs.b1 = extractelement <4 x i8> %lhs, i64 1
  %rhs.b1 = extractelement <4 x i8> %rhs, i64 1
  %prod1 = mul nuw nsw i8 %lhs.b1, %rhs.b1

  ; Lane 2.
  %lhs.b2 = extractelement <4 x i8> %lhs, i64 2
  %rhs.b2 = extractelement <4 x i8> %rhs, i64 2
  %prod2 = mul nuw nsw i8 %lhs.b2, %rhs.b2

  ; Lane 3.
  %lhs.b3 = extractelement <4 x i8> %lhs, i64 3
  %rhs.b3 = extractelement <4 x i8> %rhs, i64 3
  %prod3 = mul nuw nsw i8 %lhs.b3, %rhs.b3

  ; Fold the products into the i8 accumulator read from %dst.
  %acc.in = load i8, i8 addrspace(1)* %dst, align 2
  %sum0 = add i8 %prod0, %acc.in
  %sum1 = add i8 %sum0, %prod1
  %sum2 = add i8 %sum1, %prod2
  %sum3 = add i8 %sum2, %prod3

  store i8 %sum3, i8 addrspace(1)* %dst, align 2
  ret void
}

; Two-lane variant: only elements 0 and 1 of each <4 x i8> are multiplied (as
; i8) and added to the i8 accumulator in %dst. The expectations show two mads
; on every run.
; TODO: Generate udot4?
define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot2_8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NODL-NEXT:    global_load_ubyte v4, v1, s[2:3]
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
; GFX9-NODL-NEXT:    global_store_byte v1, v0, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u16 v2, v2, v3, v4
; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
; GFX10-DL-NEXT:    s_endpgm
                                   <4 x i8> addrspace(1)* %src2,
                                   i8 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item operand vectors.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %lhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %tid
  %lhs = load <4 x i8>, <4 x i8> addrspace(1)* %lhs.ptr
  %rhs.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %tid
  %rhs = load <4 x i8>, <4 x i8> addrspace(1)* %rhs.ptr

  ; Lane 0 product.
  %lhs.b0 = extractelement <4 x i8> %lhs, i64 0
  %rhs.b0 = extractelement <4 x i8> %rhs, i64 0
  %prod0 = mul nuw nsw i8 %lhs.b0, %rhs.b0

  ; Lane 1 product; lanes 2 and 3 are never read.
  %lhs.b1 = extractelement <4 x i8> %lhs, i64 1
  %rhs.b1 = extractelement <4 x i8> %rhs, i64 1
  %prod1 = mul nuw nsw i8 %lhs.b1, %rhs.b1

  ; Accumulate both products onto the i8 read from %dst and write it back.
  %acc.in = load i8, i8 addrspace(1)* %dst, align 2
  %sum0 = add i8 %prod0, %acc.in
  %sum1 = add i8 %sum0, %prod1
  store i8 %sum1, i8 addrspace(1)* %dst, align 2
  ret void
}

707define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1, 708; GFX7-LABEL: udot4_CommutationInsideMAD: 709; GFX7: ; %bb.0: ; %entry 710; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 711; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 712; GFX7-NEXT: s_mov_b32 s3, 0xf000 713; GFX7-NEXT: s_mov_b32 s10, 0 714; GFX7-NEXT: s_mov_b32 s11, s3 715; GFX7-NEXT: s_waitcnt lgkmcnt(0) 716; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 717; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 718; GFX7-NEXT: v_mov_b32_e32 v1, 0 719; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 720; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 721; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 722; GFX7-NEXT: s_mov_b32 s2, -1 723; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 724; GFX7-NEXT: s_waitcnt vmcnt(2) 725; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 726; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 727; GFX7-NEXT: s_waitcnt vmcnt(1) 728; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 729; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 730; GFX7-NEXT: s_waitcnt vmcnt(0) 731; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 732; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 733; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 734; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 735; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 736; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 737; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 738; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 739; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 740; GFX7-NEXT: s_endpgm 741; 742; GFX8-LABEL: udot4_CommutationInsideMAD: 743; GFX8: ; %bb.0: ; %entry 744; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 745; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 746; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 747; GFX8-NEXT: s_waitcnt lgkmcnt(0) 748; GFX8-NEXT: v_mov_b32_e32 v1, s5 749; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 750; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 751; GFX8-NEXT: flat_load_dword v3, v[0:1] 752; GFX8-NEXT: v_mov_b32_e32 v1, s7 753; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 754; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 755; GFX8-NEXT: flat_load_dword v2, v[0:1] 756; GFX8-NEXT: v_mov_b32_e32 v0, s0 757; GFX8-NEXT: v_mov_b32_e32 v1, s1 758; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 759; GFX8-NEXT: s_waitcnt vmcnt(2) 760; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 761; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 762; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 763; GFX8-NEXT: s_waitcnt vmcnt(1) 764; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 765; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 766; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 767; GFX8-NEXT: s_waitcnt vmcnt(0) 768; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 769; GFX8-NEXT: v_mad_u16 v2, v8, v7, v2 770; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 771; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 772; GFX8-NEXT: flat_store_byte v[0:1], v2 773; GFX8-NEXT: s_endpgm 774; 775; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: 776; GFX9-NODL: ; %bb.0: ; %entry 777; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 778; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 779; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 780; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 781; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 782; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 783; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 784; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] 785; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 786; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 787; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 788; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 789; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 790; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 791; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 792; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 793; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 794; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1 795; GFX9-NODL-NEXT: v_lshrrev_b32_e32 
v9, 24, v2 796; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 797; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 798; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] 799; GFX9-NODL-NEXT: s_endpgm 800; 801; GFX9-DL-LABEL: udot4_CommutationInsideMAD: 802; GFX9-DL: ; %bb.0: ; %entry 803; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 804; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 805; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 806; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 807; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 808; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 809; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 810; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] 811; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 812; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 813; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 814; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 815; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 816; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 817; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 818; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 819; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 820; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1 821; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 822; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 823; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 824; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] 825; GFX9-DL-NEXT: s_endpgm 826; 827; GFX10-DL-LABEL: udot4_CommutationInsideMAD: 828; GFX10-DL: ; %bb.0: ; %entry 829; GFX10-DL-NEXT: s_clause 0x1 830; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 831; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 832; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 833; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 834; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 835; GFX10-DL-NEXT: s_clause 0x1 836; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] 837; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] 838; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] 839; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 840; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, 
v2 841; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 842; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 843; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 844; GFX10-DL-NEXT: v_mad_u16 v4, v3, v2, v4 845; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 846; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 847; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 848; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 849; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 850; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0 851; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 852; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] 853; GFX10-DL-NEXT: s_endpgm 854 <4 x i8> addrspace(1)* %src2, 855 i8 addrspace(1)* nocapture %dst) { 856entry: 857 %idx = call i32 @llvm.amdgcn.workitem.id.x() 858 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx 859 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1 860 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx 861 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2 862 863 %v1e0 = extractelement <4 x i8> %vec1, i64 0 864 %v2e0 = extractelement <4 x i8> %vec2, i64 0 865 %mul1 = mul nuw nsw i8 %v2e0, %v1e0 866 867 %v1e1 = extractelement <4 x i8> %vec1, i64 1 868 %v2e1 = extractelement <4 x i8> %vec2, i64 1 869 %mul2 = mul nuw nsw i8 %v2e1, %v1e1 870 871 %v1e2 = extractelement <4 x i8> %vec1, i64 2 872 %v2e2 = extractelement <4 x i8> %vec2, i64 2 873 %mul3 = mul nuw nsw i8 %v2e2, %v1e2 874 875 %v1e3 = extractelement <4 x i8> %vec1, i64 3 876 %v2e3 = extractelement <4 x i8> %vec2, i64 3 877 %mul4 = mul nuw nsw i8 %v2e3, %v1e3 878 879 %acc = load i8, i8 addrspace(1)* %dst, align 2 880 %mad1 = add i8 %acc, %mul1 881 %mad2 = add i8 %mul2, %mad1 882 %mad3 = add i8 %mul3, %mad2 883 %mad4 = add i8 %mul4, %mad3 884 885 store i8 %mad4, i8 addrspace(1)* %dst, align 2 886 ret void 887} 888 889; TODO: Support commutation across the adds. 
; Kernel @udot4_CommutationAccrossMADs (its `define` opens the next line).
; IR under test, as written in this file: one <4 x i8> is loaded from %src1 and
; one from %src2 at index llvm.amdgcn.workitem.id.x; the four lane pairs are
; multiplied as i8 (`mul nuw nsw i8 %v2eN, %v1eN`) and summed into an i8
; accumulator loaded from %dst. The add chain deliberately starts with lane 1
; rather than lane 0: %mad1 = %acc + %mul2, %mad2 = %mad1 + %mul1, followed by
; %mul3 and %mul4. The i8 result is stored back to %dst.
; Expected output: every check prefix (GFX7, GFX8, GFX9-NODL, GFX9-DL and
; GFX10-DL) expects a chain of four multiply-adds (v_mad_u32_u24, v_mad_u16 or
; v_mad_legacy_u16); none expects a dot4 instruction for this kernel.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file) - regenerate them instead of hand-editing.
890define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1, 891; GFX7-LABEL: udot4_CommutationAccrossMADs: 892; GFX7: ; %bb.0: ; %entry 893; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 894; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 895; GFX7-NEXT: s_mov_b32 s3, 0xf000 896; GFX7-NEXT: s_mov_b32 s10, 0 897; GFX7-NEXT: s_mov_b32 s11, s3 898; GFX7-NEXT: s_waitcnt lgkmcnt(0) 899; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 900; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 901; GFX7-NEXT: v_mov_b32_e32 v1, 0 902; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 903; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 904; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 905; GFX7-NEXT: s_mov_b32 s2, -1 906; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 907; GFX7-NEXT: s_waitcnt vmcnt(2) 908; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 909; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 910; GFX7-NEXT: s_waitcnt vmcnt(1) 911; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 912; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 913; GFX7-NEXT: s_waitcnt vmcnt(0) 914; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 915; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 916; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 917; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 918; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 919; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 920; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 921; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 922; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 923; GFX7-NEXT: s_endpgm 924; 925; GFX8-LABEL: udot4_CommutationAccrossMADs: 926; GFX8: ; %bb.0: ; %entry 927; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 928; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 929; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 930; GFX8-NEXT: s_waitcnt lgkmcnt(0) 931; GFX8-NEXT: v_mov_b32_e32 v1, s5 932; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 933; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 934; GFX8-NEXT: flat_load_dword v3, v[0:1] 935; GFX8-NEXT: v_mov_b32_e32 v1, s7 936; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
s6, v2 937; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 938; GFX8-NEXT: flat_load_dword v2, v[0:1] 939; GFX8-NEXT: v_mov_b32_e32 v0, s0 940; GFX8-NEXT: v_mov_b32_e32 v1, s1 941; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 942; GFX8-NEXT: s_waitcnt vmcnt(2) 943; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 944; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 945; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 946; GFX8-NEXT: s_waitcnt vmcnt(1) 947; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 948; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 949; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 950; GFX8-NEXT: s_waitcnt vmcnt(0) 951; GFX8-NEXT: v_mad_u16 v4, v8, v7, v4 952; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 953; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 954; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 955; GFX8-NEXT: flat_store_byte v[0:1], v2 956; GFX8-NEXT: s_endpgm 957; 958; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: 959; GFX9-NODL: ; %bb.0: ; %entry 960; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 961; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 962; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 963; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 964; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 965; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 966; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 967; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] 968; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 969; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 970; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 971; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 972; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 973; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 974; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3 975; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 976; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 977; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 978; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 979; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 980; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 981; GFX9-NODL-NEXT: global_store_byte v0, 
v1, s[2:3] 982; GFX9-NODL-NEXT: s_endpgm 983; 984; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: 985; GFX9-DL: ; %bb.0: ; %entry 986; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 987; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 988; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 989; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 990; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 991; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 992; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 993; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] 994; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 995; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 996; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 997; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 998; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 999; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1000; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3 1001; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1002; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 1003; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 1004; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 1005; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 1006; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 1007; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] 1008; GFX9-DL-NEXT: s_endpgm 1009; 1010; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: 1011; GFX10-DL: ; %bb.0: ; %entry 1012; GFX10-DL-NEXT: s_clause 0x1 1013; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1014; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1015; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1016; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1017; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX10-DL-NEXT: s_clause 0x1 1019; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] 1020; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] 1021; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] 1022; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 1023; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 1024; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1025; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 1026; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 
; Kernel @udot4_multiuse_mul1 - its `define` appears partway through the next
; physical line, right after the closing `}` of the preceding kernel.
; IR under test, as written in this file: the lanes of two <4 x i8> loads
; (indexed by llvm.amdgcn.workitem.id.x) are zero-extended to i32 and multiplied
; pairwise; the accumulator is an i32 loaded from %dst. %mul1 has two uses in
; the add chain (%add = %mul1 + %acc and %add2 = %add1 + %mul1), so the lane-0
; product is added twice before %mul3 and %mul4; the i32 sum is stored to %dst.
; Expected output: the GFX7 and GFX8 assertions show five v_mad_u32_u24; the
; GFX9-NODL, GFX9-DL and GFX10-DL assertions show v_mul_u32_u24 products, one
; v_mad_u32_u24 and two v_add3_u32. No prefix expects a dot4 instruction.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file) - regenerate them instead of hand-editing.
1027; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 1028; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 1029; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 1030; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 1031; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1032; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1033; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0 1034; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 1035; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] 1036; GFX10-DL-NEXT: s_endpgm 1037 <4 x i8> addrspace(1)* %src2, 1038 i8 addrspace(1)* nocapture %dst) { 1039entry: 1040 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1041 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx 1042 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1 1043 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx 1044 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2 1045 1046 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1047 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1048 %mul1 = mul nuw nsw i8 %v2e0, %v1e0 1049 1050 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1051 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1052 %mul2 = mul nuw nsw i8 %v2e1, %v1e1 1053 1054 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1055 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1056 %mul3 = mul nuw nsw i8 %v2e2, %v1e2 1057 1058 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1059 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1060 %mul4 = mul nuw nsw i8 %v2e3, %v1e3 1061 1062 %acc = load i8, i8 addrspace(1)* %dst, align 2 1063 %mad1 = add i8 %acc, %mul2 1064 %mad2 = add i8 %mad1, %mul1 1065 %mad3 = add i8 %mad2, %mul3 1066 %mad4 = add i8 %mad3, %mul4 1067 1068 store i8 %mad4, i8 addrspace(1)* %dst, align 2 1069 ret void 1070} 1071 1072define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, 1073; GFX7-LABEL: udot4_multiuse_mul1: 1074; GFX7: ; %bb.0: ; %entry 1075; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1076; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1077; GFX7-NEXT: 
s_mov_b32 s3, 0xf000 1078; GFX7-NEXT: s_mov_b32 s10, 0 1079; GFX7-NEXT: s_mov_b32 s11, s3 1080; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1082; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1083; GFX7-NEXT: v_mov_b32_e32 v1, 0 1084; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1085; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1086; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1087; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1088; GFX7-NEXT: s_mov_b32 s2, -1 1089; GFX7-NEXT: s_waitcnt vmcnt(1) 1090; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 1091; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 1092; GFX7-NEXT: s_waitcnt vmcnt(0) 1093; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 1094; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 1095; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1096; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4 1097; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 1098; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 1099; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 1100; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 1101; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1102; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1103; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 1104; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1105; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1106; GFX7-NEXT: s_endpgm 1107; 1108; GFX8-LABEL: udot4_multiuse_mul1: 1109; GFX8: ; %bb.0: ; %entry 1110; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1111; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1112; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1113; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX8-NEXT: v_mov_b32_e32 v1, s5 1115; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1116; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1117; GFX8-NEXT: flat_load_dword v3, v[0:1] 1118; GFX8-NEXT: v_mov_b32_e32 v1, s7 1119; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1120; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1121; GFX8-NEXT: flat_load_dword v0, v[0:1] 1122; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1123; GFX8-NEXT: s_waitcnt vmcnt(1) 1124; GFX8-NEXT: 
v_and_b32_e32 v1, 0xff, v3 1125; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 1126; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 1127; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1128; GFX8-NEXT: s_waitcnt vmcnt(0) 1129; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 1130; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 1131; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1132; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2 1133; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8 1134; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 1135; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 1136; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1137; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 1138; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 1139; GFX8-NEXT: v_mov_b32_e32 v0, s0 1140; GFX8-NEXT: v_mov_b32_e32 v1, s1 1141; GFX8-NEXT: flat_store_dword v[0:1], v2 1142; GFX8-NEXT: s_endpgm 1143; 1144; GFX9-NODL-LABEL: udot4_multiuse_mul1: 1145; GFX9-NODL: ; %bb.0: ; %entry 1146; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1147; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1148; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1149; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1150; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1151; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1152; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1153; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1154; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1155; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 1156; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1157; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 1158; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1159; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1160; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1161; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 1162; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1163; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 1164; GFX9-NODL-NEXT: v_add3_u32 v2, v5, 
v3, v2 1165; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 1166; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1167; GFX9-NODL-NEXT: s_endpgm 1168; 1169; GFX9-DL-LABEL: udot4_multiuse_mul1: 1170; GFX9-DL: ; %bb.0: ; %entry 1171; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1172; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1173; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1174; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1175; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1176; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1177; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1178; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1179; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1180; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 1181; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1182; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 1183; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1184; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1185; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1186; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 1187; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1188; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 1189; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2 1190; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 1191; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1192; GFX9-DL-NEXT: s_endpgm 1193; 1194; GFX10-DL-LABEL: udot4_multiuse_mul1: 1195; GFX10-DL: ; %bb.0: ; %entry 1196; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1197; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1198; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1199; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX10-DL-NEXT: s_clause 0x1 1201; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1202; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1203; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1204; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1205; GFX10-DL-NEXT: 
v_and_b32_e32 v0, 0xff, v1 1206; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1207; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 1208; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1209; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 1210; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1211; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 1212; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1213; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1214; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1215; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 1216; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 1217; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1218; GFX10-DL-NEXT: s_endpgm 1219 <4 x i8> addrspace(1)* %src2, 1220 i32 addrspace(1)* nocapture %dst) { 1221entry: 1222 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1223 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx 1224 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1 1225 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx 1226 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2 1227 1228 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1229 %cv1e0 = zext i8 %v1e0 to i32 1230 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1231 %cv2e0 = zext i8 %v2e0 to i32 1232 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1233 1234 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1235 %cv1e1 = zext i8 %v1e1 to i32 1236 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1237 %cv2e1 = zext i8 %v2e1 to i32 1238 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1239 1240 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1241 %cv1e2 = zext i8 %v1e2 to i32 1242 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1243 %cv2e2 = zext i8 %v2e2 to i32 1244 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1245 1246 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1247 %cv1e3 = zext i8 %v1e3 to i32 1248 %v2e3 = 
; Kernel @udot4_multiuse_add1 - its `define` appears partway through the next
; physical line, right after the closing `}` of the preceding kernel.
; IR under test, as written in this file: the lanes of two <4 x i8> loads
; (indexed by llvm.amdgcn.workitem.id.x) are zero-extended to i32 and multiplied
; pairwise; the accumulator is an i32 loaded from %dst. The first partial sum
; %add1 = %mul2 + %acc has two uses: it continues the chain (%add2 = %add1 +
; %mul1, then %mul3 and %mul4) and it also feeds %add = %add1 + %acc. The value
; stored back to %dst is %res = %add4 + %add.
; Expected output: every check prefix expects v_mad_u32_u24 plus separate add
; instructions (v_add_i32, v_add_u32, v_add_nc_u32 or v_add3_u32); the
; GFX9-NODL, GFX9-DL and GFX10-DL assertions additionally show
; v_mul_u32_u24_sdwa products. No prefix expects a dot4 instruction.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file) - regenerate them instead of hand-editing.
extractelement <4 x i8> %vec2, i64 3 1249 %cv2e3 = zext i8 %v2e3 to i32 1250 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1251 1252 %acc = load i32, i32 addrspace(1)* %dst, align 4 1253 %add = add i32 %mul1, %acc 1254 %add1 = add i32 %mul2, %add 1255 %add2 = add i32 %add1, %mul1 1256 %add3 = add i32 %add2, %mul3 1257 %add4 = add i32 %add3, %mul4 1258 1259 store i32 %add4, i32 addrspace(1)* %dst, align 4 1260 ret void 1261} 1262 1263define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, 1264; GFX7-LABEL: udot4_multiuse_add1: 1265; GFX7: ; %bb.0: ; %entry 1266; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1267; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1268; GFX7-NEXT: s_mov_b32 s3, 0xf000 1269; GFX7-NEXT: s_mov_b32 s10, 0 1270; GFX7-NEXT: s_mov_b32 s11, s3 1271; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1272; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1273; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1274; GFX7-NEXT: v_mov_b32_e32 v1, 0 1275; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1276; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1277; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1278; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1279; GFX7-NEXT: s_mov_b32 s2, -1 1280; GFX7-NEXT: s_waitcnt vmcnt(1) 1281; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 1282; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 1283; GFX7-NEXT: s_waitcnt vmcnt(0) 1284; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 1285; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 1286; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4 1288; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 1289; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 1290; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 1291; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1292; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1293; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 1294; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3 1295; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1296; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6 1297; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1298; 
GFX7-NEXT: s_endpgm 1299; 1300; GFX8-LABEL: udot4_multiuse_add1: 1301; GFX8: ; %bb.0: ; %entry 1302; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1303; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1304; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1305; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1306; GFX8-NEXT: v_mov_b32_e32 v1, s5 1307; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1308; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1309; GFX8-NEXT: flat_load_dword v3, v[0:1] 1310; GFX8-NEXT: v_mov_b32_e32 v1, s7 1311; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1312; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1313; GFX8-NEXT: flat_load_dword v0, v[0:1] 1314; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1315; GFX8-NEXT: s_waitcnt vmcnt(1) 1316; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 1317; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 1318; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 1319; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1320; GFX8-NEXT: s_waitcnt vmcnt(0) 1321; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 1322; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 1323; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1324; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2 1325; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 1326; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 1327; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1328; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 1329; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4 1330; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1 1331; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5 1332; GFX8-NEXT: v_mov_b32_e32 v0, s0 1333; GFX8-NEXT: v_mov_b32_e32 v1, s1 1334; GFX8-NEXT: flat_store_dword v[0:1], v2 1335; GFX8-NEXT: s_endpgm 1336; 1337; GFX9-NODL-LABEL: udot4_multiuse_add1: 1338; GFX9-NODL: ; %bb.0: ; %entry 1339; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1340; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1341; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1342; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1344; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1345; GFX9-NODL-NEXT: 
s_load_dword s0, s[2:3], 0x0 1346; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1347; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1348; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 1349; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1350; GFX9-NODL-NEXT: v_bfe_u32 v5, v2, 8, 8 1351; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1352; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1353; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1354; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1355; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 1356; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2 1357; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6 1358; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4 1359; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1360; GFX9-NODL-NEXT: s_endpgm 1361; 1362; GFX9-DL-LABEL: udot4_multiuse_add1: 1363; GFX9-DL: ; %bb.0: ; %entry 1364; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1365; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1366; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1367; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1369; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1370; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1371; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1372; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1373; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 8, 8 1374; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1375; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 8 1376; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1377; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1378; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1379; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX9-DL-NEXT: v_mad_u32_u24 
v2, v4, v5, s0 1381; GFX9-DL-NEXT: v_add_u32_e32 v4, s0, v2 1382; GFX9-DL-NEXT: v_add3_u32 v2, v2, v3, v6 1383; GFX9-DL-NEXT: v_add3_u32 v1, v2, v1, v4 1384; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1385; GFX9-DL-NEXT: s_endpgm 1386; 1387; GFX10-DL-LABEL: udot4_multiuse_add1: 1388; GFX10-DL: ; %bb.0: ; %entry 1389; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1390; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1391; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1392; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX10-DL-NEXT: s_clause 0x1 1394; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1395; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1396; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1397; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1398; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 8, 8 1399; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1400; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8 1401; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1402; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 1404; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1405; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1406; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0 1407; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3 1408; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 1409; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 1410; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] 1411; GFX10-DL-NEXT: s_endpgm 1412 <4 x i8> addrspace(1)* %src2, 1413 i32 addrspace(1)* nocapture %dst) { 1414entry: 1415 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1416 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx 1417 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1 1418 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx 1419 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* 
%gep2 1420 1421 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1422 %cv1e0 = zext i8 %v1e0 to i32 1423 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1424 %cv2e0 = zext i8 %v2e0 to i32 1425 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1426 1427 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1428 %cv1e1 = zext i8 %v1e1 to i32 1429 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1430 %cv2e1 = zext i8 %v2e1 to i32 1431 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1432 1433 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1434 %cv1e2 = zext i8 %v1e2 to i32 1435 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1436 %cv2e2 = zext i8 %v2e2 to i32 1437 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1438 1439 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1440 %cv1e3 = zext i8 %v1e3 to i32 1441 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1442 %cv2e3 = zext i8 %v2e3 to i32 1443 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1444 1445 %acc = load i32, i32 addrspace(1)* %dst, align 4 1446 %add1 = add i32 %mul2, %acc 1447 %add = add i32 %add1, %acc 1448 %add2 = add i32 %add1, %mul1 1449 %add3 = add i32 %add2, %mul3 1450 %add4 = add i32 %add3, %mul4 1451 %res = add i32 %add4, %add 1452 store i32 %res, i32 addrspace(1)* %dst, align 4 1453 ret void 1454} 1455 1456define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, 1457; GFX7-LABEL: notdot4_mixedtypes: 1458; GFX7: ; %bb.0: ; %entry 1459; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1460; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1461; GFX7-NEXT: s_mov_b32 s3, 0xf000 1462; GFX7-NEXT: s_mov_b32 s10, 0 1463; GFX7-NEXT: s_mov_b32 s11, s3 1464; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1465; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1466; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1467; GFX7-NEXT: v_mov_b32_e32 v1, 0 1468; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1469; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1470; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1471; GFX7-NEXT: s_mov_b32 s2, -1 1472; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 
1473; GFX7-NEXT: s_waitcnt vmcnt(2) 1474; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 1475; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 1476; GFX7-NEXT: s_waitcnt vmcnt(1) 1477; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 1478; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 1479; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 1480; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 1481; GFX7-NEXT: s_waitcnt vmcnt(0) 1482; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 1483; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 1484; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 1485; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 1486; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1487; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1488; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 1489; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1490; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 1491; GFX7-NEXT: s_endpgm 1492; 1493; GFX8-LABEL: notdot4_mixedtypes: 1494; GFX8: ; %bb.0: ; %entry 1495; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1496; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1497; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1498; GFX8-NEXT: v_mov_b32_e32 v5, 0xff 1499; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1500; GFX8-NEXT: v_mov_b32_e32 v1, s5 1501; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1502; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1503; GFX8-NEXT: flat_load_dword v3, v[0:1] 1504; GFX8-NEXT: v_mov_b32_e32 v1, s7 1505; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1506; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1507; GFX8-NEXT: flat_load_dword v2, v[0:1] 1508; GFX8-NEXT: v_mov_b32_e32 v0, s0 1509; GFX8-NEXT: v_mov_b32_e32 v1, s1 1510; GFX8-NEXT: flat_load_ushort v4, v[0:1] 1511; GFX8-NEXT: s_waitcnt vmcnt(2) 1512; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 1513; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 1514; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 1515; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1516; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1517; GFX8-NEXT: s_waitcnt vmcnt(1) 1518; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 1519; 
GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 1520; GFX8-NEXT: v_bfe_i32 v7, v2, 0, 8 1521; GFX8-NEXT: s_waitcnt vmcnt(0) 1522; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 1523; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1524; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 1525; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1526; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 1527; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1528; GFX8-NEXT: flat_store_short v[0:1], v2 1529; GFX8-NEXT: s_endpgm 1530; 1531; GFX9-NODL-LABEL: notdot4_mixedtypes: 1532; GFX9-NODL: ; %bb.0: ; %entry 1533; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1534; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1535; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1536; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff 1537; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1539; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1540; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1541; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] 1542; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 1543; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 1544; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1545; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 1546; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 1547; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 1548; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 1549; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 1550; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1551; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 1552; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1553; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1554; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 1555; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 1556; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1557; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 1558; GFX9-NODL-NEXT: v_mad_legacy_u16 
v1, v1, v2, v3 1559; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] 1560; GFX9-NODL-NEXT: s_endpgm 1561; 1562; GFX9-DL-LABEL: notdot4_mixedtypes: 1563; GFX9-DL: ; %bb.0: ; %entry 1564; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1565; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1566; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1567; GFX9-DL-NEXT: s_movk_i32 s0, 0xff 1568; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1569; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1570; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1571; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1572; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] 1573; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1574; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 1575; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1576; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 1577; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 1578; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7 1579; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 1580; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 1581; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1582; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 1583; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1584; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1585; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 1586; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 1587; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1588; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 1589; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1590; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] 1591; GFX9-DL-NEXT: s_endpgm 1592; 1593; GFX10-DL-LABEL: notdot4_mixedtypes: 1594; GFX10-DL: ; %bb.0: ; %entry 1595; GFX10-DL-NEXT: s_clause 0x1 1596; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1597; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1598; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1599; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff 1600; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 
1601; GFX10-DL-NEXT: s_clause 0x1 1602; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1603; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1604; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1605; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] 1606; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 1607; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 1608; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1609; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 1610; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 1611; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8 1612; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 1613; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 1614; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1615; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 1616; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1617; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1618; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 1619; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1620; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3 1621; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 1622; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 1623; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] 1624; GFX10-DL-NEXT: s_endpgm 1625 <4 x i8> addrspace(1)* %src2, 1626 i16 addrspace(1)* nocapture %dst) { 1627entry: 1628 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1629 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx 1630 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1 1631 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx 1632 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2 1633 1634 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1635 %cv1e0 = sext i8 %v1e0 to i16 1636 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1637 %cv2e0 = sext i8 %v2e0 to i16 1638 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0 1639 1640 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1641 %cv1e1 = zext i8 %v1e1 to i16 1642 %v2e1 = extractelement <4 x i8> %vec2, i64 1 
; Lanes 1-3 of both vectors are widened with zext, while lane 0 (above) uses sext.
  %cv2e1 = zext i8 %v2e1 to i16
  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1

  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = zext i8 %v1e2 to i16
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = zext i8 %v2e2 to i16
  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2

  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = zext i8 %v1e3 to i16
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = zext i8 %v2e3 to i16
  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3

  %acc = load i16, i16 addrspace(1)* %dst, align 2
  %add1 = add i16 %mul2, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul3
  %add4 = add i16 %add3, %mul4

  store i16 %add4, i16 addrspace(1)* %dst, align 2
  ret void
}

; TODO: cleanup s_lshr_b32 and support this pattern.
; Vector-multiply form of the unsigned dot4 with a 32-bit accumulator: both
; <4 x i8> operands are zero-extended to <4 x i32> and multiplied as whole
; vectors, then the four lanes are extracted and added to the i32 loaded from
; %dst, which is stored back to %dst.
; NOTE(review): none of the expected sequences below contains s_lshr_b32 any
; more, so the first half of the TODO looks stale - confirm before dropping it.
; The gfx906 / gfx1011 / gfx1012 expectations still show per-byte
; v_mul_u32_u24 plus v_add3_u32 rather than a single dot instruction, so the
; "support this pattern" half still applies.
define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc32_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_vecMul:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-DL-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item indexes its own <4 x i8> element of %src1 and %src2.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2

  ; Widen whole vectors (unsigned) before the multiply.
  %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
  %cvec2 = zext <4 x i8> %vec2 to <4 x i32>

  ; One vector multiply; the lanes are only split apart afterwards.
  %mul = mul <4 x i32> %cvec1, %cvec2
  %mul0 = extractelement <4 x i32> %mul, i64 0
  %mul1 = extractelement <4 x i32> %mul, i64 1
  %mul2 = extractelement <4 x i32> %mul, i64 2
  %mul3 = extractelement <4 x i32> %mul, i64 3

  ; Accumulate the four products, in lane order, onto the value read from %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3

  store i32 %add4, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: This pattern should be recognized.
; Same vector-multiply shape as udot4_acc32_vecMul, but the <4 x i8> operands
; are zero-extended only to <4 x i16> and the accumulator read from / written
; to %dst is an i16.
define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc16_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; GFX7-NEXT: v_or_b32_e32 v4, v7, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3
; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v9, 8, v2
; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: v_mad_u16 v2, v7, v9, v2
; GFX8-NEXT: v_mad_u16 v2, v10, v5, v2
; GFX8-NEXT: v_mad_u16 v2, v6, v8, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc16_vecMul:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v5, 8, v1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v7, 8, v2
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NODL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v10
; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v8, 16, v4
; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v6, 16, v9
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3
; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v5, v4
; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2
; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v5, 8, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v7, 8, v2
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v10
; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX9-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4
; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v9
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v4
; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2
; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2
; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i16 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item indexes its own <4 x i8> element of %src1 and %src2.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2

  ; Widen whole vectors (unsigned) to 16-bit lanes before the multiply.
  %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
  %cvec2 = zext <4 x i8> %vec2 to <4 x i16>

  %mul = mul <4 x i16> %cvec1, %cvec2
  %mul0 = extractelement <4 x i16> %mul, i64 0
  %mul1 = extractelement <4 x i16> %mul, i64 1
  %mul2 = extractelement <4 x i16> %mul, i64 2
  %mul3 = extractelement <4 x i16> %mul, i64 3

  ; NOTE(review): the i16 accumulator is accessed with align 4 here, whereas
  ; notdot4_mixedtypes uses align 2 for its i16 accumulator - confirm the
  ; difference is intentional.
  %acc = load i16, i16 addrspace(1)* %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3

  store i16 %add4, i16 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern.
; Byte-wide variant of the vector-multiply dot4: the two <4 x i8> operands are
; multiplied directly as i8 vectors (no extension), the four i8 lanes are
; extracted and added to the i8 loaded from %dst, and the i8 sum is stored back.
; The mul and adds carry no nuw/nsw flags.
define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc8_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX8-NEXT: v_mul_lo_u16_e32 v9, v5, v6
; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
; GFX8-NEXT: v_or_b32_e32 v8, v8, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v7
; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
; GFX8-NEXT: v_add_u16_e32 v2, v2, v7
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc8_vecMul:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; GFX9-NODL-NEXT: v_or_b32_e32 v7, v7, v8
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v7
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-DL-NEXT: v_lshrrev_b16 v8, 8, v2
; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4
; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8
; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4
; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
                                             <4 x i8> addrspace(1)* %src2,
                                             i8 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item indexes its own <4 x i8> element of %src1 and %src2.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2

  ; The bytes are multiplied at i8 width; there is no zext/sext step here.
  %mul = mul <4 x i8> %vec1, %vec2
  %mul0 = extractelement <4 x i8> %mul, i64 0
  %mul1 = extractelement <4 x i8> %mul, i64 1
  %mul2 = extractelement <4 x i8> %mul, i64 2
  %mul3 = extractelement <4 x i8> %mul, i64 3

  ; NOTE(review): the single-byte accumulator is loaded and stored with
  ; align 4 - confirm the over-alignment is intentional.
  %acc = load i8, i8 addrspace(1)* %dst, align 4
  %add1 = add i8 %mul0, %acc
  %add2 = add i8 %add1, %mul1
  %add3 = add i8 %add2, %mul2
  %add4 = add i8 %add3, %mul3

  store i8 %add4, i8 addrspace(1)* %dst, align 4
  ret void
}

; Intrinsic used by every kernel above to obtain the per-work-item index.
declare i32 @llvm.amdgcn.workitem.id.x()
