1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s 8 9; add(mul(S0.x, S1.x), 10; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) 11 12define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, 13; GFX7-LABEL: udot2: 14; GFX7: ; %bb.0: ; %entry 15; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 16; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 17; GFX7-NEXT: s_mov_b32 s3, 0xf000 18; GFX7-NEXT: s_mov_b32 s10, 0 19; GFX7-NEXT: s_mov_b32 s11, s3 20; GFX7-NEXT: s_waitcnt lgkmcnt(0) 21; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 22; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 23; GFX7-NEXT: v_mov_b32_e32 v1, 0 24; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 25; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 26; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 27; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 28; GFX7-NEXT: s_mov_b32 s4, 0xffff 29; GFX7-NEXT: s_mov_b32 s2, -1 30; GFX7-NEXT: s_waitcnt vmcnt(1) 31; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 32; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 33; GFX7-NEXT: s_waitcnt vmcnt(0) 34; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 35; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 36; GFX7-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 38; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 39; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 40; 
GFX7-NEXT: s_endpgm 41; 42; GFX8-LABEL: udot2: 43; GFX8: ; %bb.0: ; %entry 44; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 45; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 46; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 47; GFX8-NEXT: s_mov_b32 s2, 0xffff 48; GFX8-NEXT: s_waitcnt lgkmcnt(0) 49; GFX8-NEXT: v_mov_b32_e32 v1, s5 50; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 51; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 52; GFX8-NEXT: flat_load_dword v3, v[0:1] 53; GFX8-NEXT: v_mov_b32_e32 v1, s7 54; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 55; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 56; GFX8-NEXT: flat_load_dword v0, v[0:1] 57; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 58; GFX8-NEXT: s_waitcnt vmcnt(1) 59; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 60; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 61; GFX8-NEXT: s_waitcnt vmcnt(0) 62; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 63; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 64; GFX8-NEXT: s_waitcnt lgkmcnt(0) 65; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 66; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 67; GFX8-NEXT: v_mov_b32_e32 v0, s0 68; GFX8-NEXT: v_mov_b32_e32 v1, s1 69; GFX8-NEXT: flat_store_dword v[0:1], v2 70; GFX8-NEXT: s_endpgm 71; 72; GFX9-NODL-LABEL: udot2: 73; GFX9-NODL: ; %bb.0: ; %entry 74; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 75; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 76; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 77; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 78; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 79; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 80; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 81; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 82; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 83; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 84; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 85; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 86; GFX9-NODL-NEXT: v_add3_u32 v1, v1, 
s0, v3 87; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 88; GFX9-NODL-NEXT: s_endpgm 89; 90; GFX9-DL-LABEL: udot2: 91; GFX9-DL: ; %bb.0: ; %entry 92; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 93; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 94; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 95; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 96; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 97; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 98; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 99; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 100; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 101; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 102; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 103; GFX9-DL-NEXT: s_endpgm 104; 105; GFX10-DL-LABEL: udot2: 106; GFX10-DL: ; %bb.0: ; %entry 107; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 108; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 109; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 110; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 111; GFX10-DL-NEXT: s_clause 0x1 112; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 113; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 114; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 115; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 116; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 117; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 118; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 119; GFX10-DL-NEXT: s_endpgm 120 <2 x i16> addrspace(1)* %src2, 121 i32 addrspace(1)* nocapture %dst) { 122entry: 123 %idx = call i32 @llvm.amdgcn.workitem.id.x() 124 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 125 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 126 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 127 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 128 129 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 130 %conv = zext i16 %s1.elt1 to i32 131 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 132 %conv2 = zext i16 %s2.elt1 to i32 133 %mul1 = mul nuw 
i32 %conv2, %conv 134 135 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 136 %conv3 = zext i16 %s1.elt2 to i32 137 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 138 %conv4 = zext i16 %s2.elt2 to i32 139 %mul2 = mul nuw i32 %conv4, %conv3 140 141 %s3 = load i32, i32 addrspace(1)* %dst, align 4 142 %add = add i32 %mul2, %s3 143 %add6 = add i32 %add, %mul1 144 store i32 %add6, i32 addrspace(1)* %dst, align 4 145 ret void 146} 147 148; TODO: Support this pattern 149; add(S3, 150; add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) 151define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, 152; GFX7-LABEL: udot2_MulMul: 153; GFX7: ; %bb.0: ; %entry 154; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 155; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 156; GFX7-NEXT: s_mov_b32 s3, 0xf000 157; GFX7-NEXT: s_mov_b32 s10, 0 158; GFX7-NEXT: s_mov_b32 s11, s3 159; GFX7-NEXT: s_waitcnt lgkmcnt(0) 160; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 161; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 162; GFX7-NEXT: v_mov_b32_e32 v1, 0 163; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 164; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 165; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 166; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 167; GFX7-NEXT: s_mov_b32 s4, 0xffff 168; GFX7-NEXT: s_mov_b32 s2, -1 169; GFX7-NEXT: s_waitcnt vmcnt(1) 170; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 171; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 172; GFX7-NEXT: s_waitcnt vmcnt(0) 173; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 174; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 175; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 176; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 177; GFX7-NEXT: s_waitcnt lgkmcnt(0) 178; GFX7-NEXT: v_add_i32_e32 v0, vcc, s5, v0 179; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 180; GFX7-NEXT: s_endpgm 181; 182; GFX8-LABEL: udot2_MulMul: 183; GFX8: ; %bb.0: ; %entry 184; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 185; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 186; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 187; GFX8-NEXT: s_waitcnt lgkmcnt(0) 188; GFX8-NEXT: v_mov_b32_e32 v1, s5 189; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 190; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 191; GFX8-NEXT: flat_load_dword v3, v[0:1] 192; GFX8-NEXT: v_mov_b32_e32 v1, s7 193; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 194; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 195; GFX8-NEXT: flat_load_dword v0, v[0:1] 196; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 197; GFX8-NEXT: s_waitcnt vmcnt(1) 198; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 199; GFX8-NEXT: s_waitcnt vmcnt(0) 200; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 201; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 202; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1 203; GFX8-NEXT: s_waitcnt lgkmcnt(0) 204; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 205; GFX8-NEXT: v_mov_b32_e32 v0, s0 206; GFX8-NEXT: v_mov_b32_e32 v1, s1 207; GFX8-NEXT: flat_store_dword v[0:1], v2 208; GFX8-NEXT: s_endpgm 209; 210; GFX9-NODL-LABEL: udot2_MulMul: 211; GFX9-NODL: ; %bb.0: ; %entry 212; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 213; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 214; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 215; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 216; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 217; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 218; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 219; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 220; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 221; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 222; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 223; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 224; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 225; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 226; GFX9-NODL-NEXT: s_endpgm 227; 228; GFX9-DL-LABEL: 
udot2_MulMul: 229; GFX9-DL: ; %bb.0: ; %entry 230; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 231; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 232; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 233; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 234; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 235; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 236; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 237; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 238; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 239; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 240; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 241; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 242; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 243; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 244; GFX9-DL-NEXT: s_endpgm 245; 246; GFX10-DL-LABEL: udot2_MulMul: 247; GFX10-DL: ; %bb.0: ; %entry 248; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 249; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 250; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 251; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 252; GFX10-DL-NEXT: s_clause 0x1 253; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 254; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 255; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 256; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 257; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 258; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 259; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 260; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 261; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s2 262; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 263; GFX10-DL-NEXT: s_endpgm 264 <2 x i16> addrspace(1)* %src2, 265 i32 addrspace(1)* nocapture %dst) { 266entry: 267 %idx = call i32 @llvm.amdgcn.workitem.id.x() 268 %gep1 = getelementptr <2 x i16>, <2 x i16> 
addrspace(1)* %src1, i32 %idx 269 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 270 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 271 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 272 273 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 274 %conv = zext i16 %s1.elt1 to i32 275 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 276 %conv2 = zext i16 %s2.elt1 to i32 277 %mul1 = mul nuw i32 %conv2, %conv 278 279 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 280 %conv3 = zext i16 %s1.elt2 to i32 281 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 282 %conv4 = zext i16 %s2.elt2 to i32 283 %mul2 = mul nuw i32 %conv4, %conv3 284 %s3 = load i32, i32 addrspace(1)* %dst, align 4 285 %add = add i32 %mul2, %mul1 286 %add6 = add i32 %add, %s3 287 store i32 %add6, i32 addrspace(1)* %dst, align 4 288 ret void 289} 290 291define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, 292; GFX7-LABEL: idot2: 293; GFX7: ; %bb.0: ; %entry 294; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 295; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 296; GFX7-NEXT: s_mov_b32 s3, 0xf000 297; GFX7-NEXT: s_mov_b32 s10, 0 298; GFX7-NEXT: s_mov_b32 s11, s3 299; GFX7-NEXT: s_waitcnt lgkmcnt(0) 300; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 301; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 302; GFX7-NEXT: v_mov_b32_e32 v1, 0 303; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 304; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 305; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 306; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 307; GFX7-NEXT: s_mov_b32 s2, -1 308; GFX7-NEXT: s_waitcnt vmcnt(1) 309; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 310; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 311; GFX7-NEXT: s_waitcnt vmcnt(0) 312; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 313; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 314; GFX7-NEXT: s_waitcnt lgkmcnt(0) 315; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 316; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 317; GFX7-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 318; GFX7-NEXT: s_endpgm 319; 320; GFX8-LABEL: idot2: 321; GFX8: ; %bb.0: ; %entry 322; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 323; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 324; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 325; GFX8-NEXT: s_waitcnt lgkmcnt(0) 326; GFX8-NEXT: v_mov_b32_e32 v1, s5 327; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 328; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 329; GFX8-NEXT: flat_load_dword v3, v[0:1] 330; GFX8-NEXT: v_mov_b32_e32 v1, s7 331; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 332; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 333; GFX8-NEXT: flat_load_dword v0, v[0:1] 334; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 335; GFX8-NEXT: s_waitcnt vmcnt(1) 336; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 337; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 338; GFX8-NEXT: s_waitcnt vmcnt(0) 339; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 340; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 341; GFX8-NEXT: s_waitcnt lgkmcnt(0) 342; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 343; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 344; GFX8-NEXT: v_mov_b32_e32 v0, s0 345; GFX8-NEXT: v_mov_b32_e32 v1, s1 346; GFX8-NEXT: flat_store_dword v[0:1], v2 347; GFX8-NEXT: s_endpgm 348; 349; GFX9-NODL-LABEL: idot2: 350; GFX9-NODL: ; %bb.0: ; %entry 351; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 352; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 353; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 354; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 355; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 356; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 357; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 358; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 359; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 360; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 361; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 362; GFX9-NODL-NEXT: s_waitcnt 
lgkmcnt(0) 363; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 364; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 365; GFX9-NODL-NEXT: s_endpgm 366; 367; GFX9-DL-LABEL: idot2: 368; GFX9-DL: ; %bb.0: ; %entry 369; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 370; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 371; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 372; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 373; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 374; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 375; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 376; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 377; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 378; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 379; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 380; GFX9-DL-NEXT: s_endpgm 381; 382; GFX10-DL-LABEL: idot2: 383; GFX10-DL: ; %bb.0: ; %entry 384; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 385; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 386; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 387; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 388; GFX10-DL-NEXT: s_clause 0x1 389; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 390; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 391; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 392; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 393; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 394; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 395; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 396; GFX10-DL-NEXT: s_endpgm 397 <2 x i16> addrspace(1)* %src2, 398 i32 addrspace(1)* nocapture %dst) { 399entry: 400 %idx = call i32 @llvm.amdgcn.workitem.id.x() 401 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 402 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 403 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 404 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 405 406 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 407 %conv = sext i16 %s1.elt1 to i32 408 %s2.elt1 = extractelement <2 x i16> %vec2, 
i64 0 409 %conv2 = sext i16 %s2.elt1 to i32 410 %mul1 = mul nuw i32 %conv2, %conv 411 412 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 413 %conv3 = sext i16 %s1.elt2 to i32 414 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 415 %conv4 = sext i16 %s2.elt2 to i32 416 %mul2 = mul nuw i32 %conv4, %conv3 417 418 %s3 = load i32, i32 addrspace(1)* %dst, align 4 419 %add = add i32 %mul2, %s3 420 %add6 = add i32 %add, %mul1 421 store i32 %add6, i32 addrspace(1)* %dst, align 4 422 ret void 423} 424 425define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, 426; GFX7-LABEL: idot2_MixedTypedMul: 427; GFX7: ; %bb.0: ; %entry 428; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 429; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 430; GFX7-NEXT: s_mov_b32 s3, 0xf000 431; GFX7-NEXT: s_mov_b32 s10, 0 432; GFX7-NEXT: s_mov_b32 s11, s3 433; GFX7-NEXT: s_waitcnt lgkmcnt(0) 434; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 435; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 436; GFX7-NEXT: v_mov_b32_e32 v1, 0 437; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 438; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 439; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 440; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 441; GFX7-NEXT: s_mov_b32 s2, -1 442; GFX7-NEXT: s_waitcnt vmcnt(1) 443; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 444; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 445; GFX7-NEXT: s_waitcnt vmcnt(0) 446; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 447; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 448; GFX7-NEXT: s_waitcnt lgkmcnt(0) 449; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 450; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v1 451; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 452; GFX7-NEXT: s_endpgm 453; 454; GFX8-LABEL: idot2_MixedTypedMul: 455; GFX8: ; %bb.0: ; %entry 456; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 457; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 458; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 459; GFX8-NEXT: s_waitcnt lgkmcnt(0) 460; GFX8-NEXT: v_mov_b32_e32 v1, s5 
461; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 462; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 463; GFX8-NEXT: flat_load_dword v3, v[0:1] 464; GFX8-NEXT: v_mov_b32_e32 v1, s7 465; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 466; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 467; GFX8-NEXT: flat_load_dword v0, v[0:1] 468; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 469; GFX8-NEXT: s_waitcnt vmcnt(1) 470; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 471; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 472; GFX8-NEXT: s_waitcnt vmcnt(0) 473; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 474; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 475; GFX8-NEXT: s_waitcnt lgkmcnt(0) 476; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 477; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 478; GFX8-NEXT: v_mov_b32_e32 v0, s0 479; GFX8-NEXT: v_mov_b32_e32 v1, s1 480; GFX8-NEXT: flat_store_dword v[0:1], v2 481; GFX8-NEXT: s_endpgm 482; 483; GFX9-NODL-LABEL: idot2_MixedTypedMul: 484; GFX9-NODL: ; %bb.0: ; %entry 485; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 486; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 487; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 488; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 489; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 490; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 491; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 492; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 493; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 494; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 495; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 496; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 497; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 498; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 499; GFX9-NODL-NEXT: s_endpgm 500; 501; GFX9-DL-LABEL: idot2_MixedTypedMul: 502; GFX9-DL: ; %bb.0: ; %entry 503; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 504; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 505; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 506; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 507; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 508; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 509; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 510; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 511; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 512; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 513; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 514; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 515; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 516; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 517; GFX9-DL-NEXT: s_endpgm 518; 519; GFX10-DL-LABEL: idot2_MixedTypedMul: 520; GFX10-DL: ; %bb.0: ; %entry 521; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 522; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 523; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 524; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 525; GFX10-DL-NEXT: s_clause 0x1 526; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 527; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 528; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 529; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 530; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 531; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 532; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 533; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 534; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 535; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 536; GFX10-DL-NEXT: s_endpgm 537 <2 x i16> addrspace(1)* %src2, 538 i32 addrspace(1)* nocapture %dst) { 539entry: 540 %idx = call i32 @llvm.amdgcn.workitem.id.x() 541 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 542 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 543 %gep2 = 
getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 544 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 545 546 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 547 %conv = sext i16 %s1.elt1 to i32 548 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 549 %conv2 = sext i16 %s2.elt1 to i32 550 %mul1 = mul nuw i32 %conv2, %conv 551 552 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 553 %conv3 = zext i16 %s1.elt2 to i32 554 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 555 %conv4 = zext i16 %s2.elt2 to i32 556 %mul2 = mul nuw i32 %conv4, %conv3 557 558 %s3 = load i32, i32 addrspace(1)* %dst, align 4 559 %add = add i32 %mul2, %s3 560 %add6 = add i32 %add, %mul1 561 store i32 %add6, i32 addrspace(1)* %dst, align 4 562 ret void 563} 564 565define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, 566; GFX7-LABEL: udot2_alt_AddOperands: 567; GFX7: ; %bb.0: ; %entry 568; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 569; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 570; GFX7-NEXT: s_mov_b32 s3, 0xf000 571; GFX7-NEXT: s_mov_b32 s10, 0 572; GFX7-NEXT: s_mov_b32 s11, s3 573; GFX7-NEXT: s_waitcnt lgkmcnt(0) 574; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 575; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 576; GFX7-NEXT: v_mov_b32_e32 v1, 0 577; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 578; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 579; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 580; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 581; GFX7-NEXT: s_mov_b32 s4, 0xffff 582; GFX7-NEXT: s_mov_b32 s2, -1 583; GFX7-NEXT: s_waitcnt vmcnt(1) 584; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 585; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 586; GFX7-NEXT: s_waitcnt vmcnt(0) 587; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 588; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 589; GFX7-NEXT: s_waitcnt lgkmcnt(0) 590; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 591; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 592; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 593; 
GFX7-NEXT: s_endpgm 594; 595; GFX8-LABEL: udot2_alt_AddOperands: 596; GFX8: ; %bb.0: ; %entry 597; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 598; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 599; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 600; GFX8-NEXT: s_mov_b32 s2, 0xffff 601; GFX8-NEXT: s_waitcnt lgkmcnt(0) 602; GFX8-NEXT: v_mov_b32_e32 v1, s5 603; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 604; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 605; GFX8-NEXT: flat_load_dword v3, v[0:1] 606; GFX8-NEXT: v_mov_b32_e32 v1, s7 607; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 608; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 609; GFX8-NEXT: flat_load_dword v0, v[0:1] 610; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 611; GFX8-NEXT: s_waitcnt vmcnt(1) 612; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 613; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 614; GFX8-NEXT: s_waitcnt vmcnt(0) 615; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 616; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 617; GFX8-NEXT: s_waitcnt lgkmcnt(0) 618; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 619; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 620; GFX8-NEXT: v_mov_b32_e32 v0, s0 621; GFX8-NEXT: v_mov_b32_e32 v1, s1 622; GFX8-NEXT: flat_store_dword v[0:1], v2 623; GFX8-NEXT: s_endpgm 624; 625; GFX9-NODL-LABEL: udot2_alt_AddOperands: 626; GFX9-NODL: ; %bb.0: ; %entry 627; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 628; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 629; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 630; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff 631; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 632; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 633; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 634; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 635; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 636; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 637; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 638; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 639; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 640; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 641; GFX9-NODL-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 642; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 643; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s1 644; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 645; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 646; GFX9-NODL-NEXT: s_endpgm 647; 648; GFX9-DL-LABEL: udot2_alt_AddOperands: 649; GFX9-DL: ; %bb.0: ; %entry 650; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 651; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 652; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 653; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 654; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 655; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 656; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 657; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 658; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 659; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 660; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 661; GFX9-DL-NEXT: s_endpgm 662; 663; GFX10-DL-LABEL: udot2_alt_AddOperands: 664; GFX10-DL: ; %bb.0: ; %entry 665; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 666; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 667; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 668; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 669; GFX10-DL-NEXT: s_clause 0x1 670; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 671; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 672; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 673; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 674; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 675; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 676; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 677; GFX10-DL-NEXT: s_endpgm 678 <2 x i16> addrspace(1)* %src2, 679 i32 addrspace(1)* nocapture %dst) { 680entry: 681 %idx = call i32 @llvm.amdgcn.workitem.id.x() 682 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 683 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 684 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 685 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* 
%gep2 686 687 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 688 %conv = zext i16 %s1.elt1 to i32 689 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 690 %conv2 = zext i16 %s2.elt1 to i32 691 %mul1 = mul nuw i32 %conv2, %conv 692 693 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 694 %conv3 = zext i16 %s1.elt2 to i32 695 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 696 %conv4 = zext i16 %s2.elt2 to i32 697 %mul2 = mul nuw i32 %conv4, %conv3 698 699 %s3 = load i32, i32 addrspace(1)* %dst, align 4 700 %add = add i32 %s3, %mul2 701 %add6 = add i32 %mul1, %add 702 store i32 %add6, i32 addrspace(1)* %dst, align 4 703 ret void 704} 705 706define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, 707; GFX7-LABEL: idot2_MixedExt: 708; GFX7: ; %bb.0: ; %entry 709; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 710; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 711; GFX7-NEXT: s_mov_b32 s3, 0xf000 712; GFX7-NEXT: s_mov_b32 s10, 0 713; GFX7-NEXT: s_mov_b32 s11, s3 714; GFX7-NEXT: s_waitcnt lgkmcnt(0) 715; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 716; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 717; GFX7-NEXT: v_mov_b32_e32 v1, 0 718; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 719; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 720; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 721; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 722; GFX7-NEXT: s_mov_b32 s2, -1 723; GFX7-NEXT: s_waitcnt vmcnt(1) 724; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 725; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 726; GFX7-NEXT: s_waitcnt vmcnt(0) 727; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 728; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 729; GFX7-NEXT: s_waitcnt lgkmcnt(0) 730; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 731; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 732; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 733; GFX7-NEXT: s_endpgm 734; 735; GFX8-LABEL: idot2_MixedExt: 736; GFX8: ; %bb.0: ; %entry 737; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 738; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 739; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 740; GFX8-NEXT: s_waitcnt lgkmcnt(0) 741; GFX8-NEXT: v_mov_b32_e32 v1, s5 742; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 743; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 744; GFX8-NEXT: flat_load_dword v3, v[0:1] 745; GFX8-NEXT: v_mov_b32_e32 v1, s7 746; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 747; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 748; GFX8-NEXT: flat_load_dword v0, v[0:1] 749; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 750; GFX8-NEXT: s_waitcnt vmcnt(1) 751; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 752; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 753; GFX8-NEXT: s_waitcnt vmcnt(0) 754; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 755; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 756; GFX8-NEXT: s_waitcnt lgkmcnt(0) 757; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 758; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 759; GFX8-NEXT: v_mov_b32_e32 v0, s0 760; GFX8-NEXT: v_mov_b32_e32 v1, s1 761; GFX8-NEXT: flat_store_dword v[0:1], v2 762; GFX8-NEXT: s_endpgm 763; 764; GFX9-NODL-LABEL: idot2_MixedExt: 765; GFX9-NODL: ; %bb.0: ; %entry 766; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 767; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 768; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 769; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 770; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 771; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 772; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 773; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 774; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 775; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 776; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 777; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 778; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 779; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 780; GFX9-NODL-NEXT: s_endpgm 781; 782; GFX9-DL-LABEL: 
idot2_MixedExt: 783; GFX9-DL: ; %bb.0: ; %entry 784; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 785; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 786; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 787; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 788; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 789; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 790; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 791; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 792; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 793; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 794; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 795; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 796; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 797; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 798; GFX9-DL-NEXT: s_endpgm 799; 800; GFX10-DL-LABEL: idot2_MixedExt: 801; GFX10-DL: ; %bb.0: ; %entry 802; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 803; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 804; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 805; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 806; GFX10-DL-NEXT: s_clause 0x1 807; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 808; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 809; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 810; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 811; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 812; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 813; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 814; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 815; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 816; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 817; GFX10-DL-NEXT: s_endpgm 818 <2 x i16> addrspace(1)* %src2, 819 i32 addrspace(1)* nocapture %dst) { 820entry: 821 %idx = call i32 @llvm.amdgcn.workitem.id.x() 822 
%gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 823 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 824 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 825 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 826 827 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 828 %conv = sext i16 %s1.elt1 to i32 829 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 830 %conv2 = zext i16 %s2.elt1 to i32 831 %mul1 = mul nuw i32 %conv2, %conv 832 833 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 834 %conv3 = sext i16 %s1.elt2 to i32 835 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 836 %conv4 = sext i16 %s2.elt2 to i32 837 %mul2 = mul nuw i32 %conv4, %conv3 838 839 %s3 = load i32, i32 addrspace(1)* %dst, align 4 840 %add = add i32 %mul2, %s3 841 %add6 = add i32 %add, %mul1 842 store i32 %add6, i32 addrspace(1)* %dst, align 4 843 ret void 844} 845 846define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, 847; GFX7-LABEL: notudot2_SameVec: 848; GFX7: ; %bb.0: ; %entry 849; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 850; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 851; GFX7-NEXT: s_mov_b32 s3, 0xf000 852; GFX7-NEXT: s_mov_b32 s10, 0 853; GFX7-NEXT: s_mov_b32 s11, s3 854; GFX7-NEXT: s_waitcnt lgkmcnt(0) 855; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 856; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 857; GFX7-NEXT: v_mov_b32_e32 v1, 0 858; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 859; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 860; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 861; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 862; GFX7-NEXT: s_mov_b32 s2, -1 863; GFX7-NEXT: s_waitcnt vmcnt(1) 864; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 865; GFX7-NEXT: s_waitcnt vmcnt(0) 866; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 867; GFX7-NEXT: s_waitcnt lgkmcnt(0) 868; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s4 869; GFX7-NEXT: v_mad_u32_u24 v0, v1, v1, v0 870; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 
0 871; GFX7-NEXT: s_endpgm 872; 873; GFX8-LABEL: notudot2_SameVec: 874; GFX8: ; %bb.0: ; %entry 875; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 876; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 877; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 878; GFX8-NEXT: s_waitcnt lgkmcnt(0) 879; GFX8-NEXT: v_mov_b32_e32 v1, s5 880; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 881; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 882; GFX8-NEXT: flat_load_dword v3, v[0:1] 883; GFX8-NEXT: v_mov_b32_e32 v1, s7 884; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 885; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 886; GFX8-NEXT: flat_load_dword v0, v[0:1] 887; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 888; GFX8-NEXT: s_waitcnt vmcnt(1) 889; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 890; GFX8-NEXT: s_waitcnt vmcnt(0) 891; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 892; GFX8-NEXT: s_waitcnt lgkmcnt(0) 893; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2 894; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0 895; GFX8-NEXT: v_mov_b32_e32 v0, s0 896; GFX8-NEXT: v_mov_b32_e32 v1, s1 897; GFX8-NEXT: flat_store_dword v[0:1], v2 898; GFX8-NEXT: s_endpgm 899; 900; GFX9-NODL-LABEL: notudot2_SameVec: 901; GFX9-NODL: ; %bb.0: ; %entry 902; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 903; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 904; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 905; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 906; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 907; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 908; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 909; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 910; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 911; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 912; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 913; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 914; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 915; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 
916; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 917; GFX9-NODL-NEXT: s_endpgm 918; 919; GFX9-DL-LABEL: notudot2_SameVec: 920; GFX9-DL: ; %bb.0: ; %entry 921; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 922; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 923; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 924; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 925; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 926; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 927; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 928; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 929; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 930; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 931; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 932; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 933; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 934; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 935; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 936; GFX9-DL-NEXT: s_endpgm 937; 938; GFX10-DL-LABEL: notudot2_SameVec: 939; GFX10-DL: ; %bb.0: ; %entry 940; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 941; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 942; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 943; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 944; GFX10-DL-NEXT: s_clause 0x1 945; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 946; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 947; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 948; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 949; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 950; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 951; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 952; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 953; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 954; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 955; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 956; 
GFX10-DL-NEXT: s_endpgm 957 <2 x i16> addrspace(1)* %src2, 958 i32 addrspace(1)* nocapture %dst) { 959entry: 960 %idx = call i32 @llvm.amdgcn.workitem.id.x() 961 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 962 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 963 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 964 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 965 966 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 967 %conv = zext i16 %s1.elt1 to i32 968 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0 969 %conv2 = zext i16 %s2.elt1 to i32 970 %mul1 = mul i32 %conv2, %conv 971 972 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1 973 %conv3 = zext i16 %s1.elt2 to i32 974 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 975 %conv4 = zext i16 %s2.elt2 to i32 976 %mul2 = mul i32 %conv4, %conv3 977 978 %s3 = load i32, i32 addrspace(1)* %dst, align 4 979 %add = add i32 %mul2, %s3 980 %add6 = add i32 %add, %mul1 981 store i32 %add6, i32 addrspace(1)* %dst, align 4 982 ret void 983} 984 985define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, 986; GFX7-LABEL: udot2_v4i16: 987; GFX7: ; %bb.0: ; %entry 988; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 989; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 990; GFX7-NEXT: s_mov_b32 s3, 0xf000 991; GFX7-NEXT: s_mov_b32 s10, 0 992; GFX7-NEXT: s_mov_b32 s11, s3 993; GFX7-NEXT: s_waitcnt lgkmcnt(0) 994; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 995; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 996; GFX7-NEXT: v_mov_b32_e32 v1, 0 997; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 998; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] 999; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1000; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1001; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 1002; GFX7-NEXT: s_mov_b32 s4, 0xffff 1003; GFX7-NEXT: s_mov_b32 s2, -1 1004; GFX7-NEXT: s_waitcnt vmcnt(1) 1005; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 1006; GFX7-NEXT: s_waitcnt 
vmcnt(0) 1007; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 1008; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1009; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1010; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s5 1012; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 1013; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1014; GFX7-NEXT: s_endpgm 1015; 1016; GFX8-LABEL: udot2_v4i16: 1017; GFX8: ; %bb.0: ; %entry 1018; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1019; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1020; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1021; GFX8-NEXT: s_mov_b32 s2, 0xffff 1022; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1023; GFX8-NEXT: v_mov_b32_e32 v1, s5 1024; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1025; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1026; GFX8-NEXT: v_mov_b32_e32 v3, s7 1027; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 1028; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1029; GFX8-NEXT: flat_load_dword v0, v[0:1] 1030; GFX8-NEXT: flat_load_dword v1, v[2:3] 1031; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 1032; GFX8-NEXT: s_waitcnt vmcnt(1) 1033; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 1034; GFX8-NEXT: s_waitcnt vmcnt(0) 1035; GFX8-NEXT: v_and_b32_e32 v3, s2, v1 1036; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1037; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1038; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s3 1040; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 1041; GFX8-NEXT: v_mov_b32_e32 v0, s0 1042; GFX8-NEXT: v_mov_b32_e32 v1, s1 1043; GFX8-NEXT: flat_store_dword v[0:1], v2 1044; GFX8-NEXT: s_endpgm 1045; 1046; GFX9-NODL-LABEL: udot2_v4i16: 1047; GFX9-NODL: ; %bb.0: ; %entry 1048; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1049; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1050; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1051; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1053; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1054; GFX9-NODL-NEXT: 
s_load_dword s0, s[2:3], 0x0 1055; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1056; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1057; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1058; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1059; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1060; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1061; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1062; GFX9-NODL-NEXT: s_endpgm 1063; 1064; GFX9-DL-LABEL: udot2_v4i16: 1065; GFX9-DL: ; %bb.0: ; %entry 1066; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1067; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1068; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1069; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1071; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1072; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1073; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1074; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1075; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1076; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1077; GFX9-DL-NEXT: s_endpgm 1078; 1079; GFX10-DL-LABEL: udot2_v4i16: 1080; GFX10-DL: ; %bb.0: ; %entry 1081; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1082; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1083; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1084; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1085; GFX10-DL-NEXT: s_clause 0x1 1086; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1087; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1088; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1089; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1090; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1091; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 1092; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 1093; GFX10-DL-NEXT: s_endpgm 1094 <4 x i16> addrspace(1)* %src2, 1095 i32 addrspace(1)* nocapture %dst) { 1096entry: 1097 %idx = call i32 
@llvm.amdgcn.workitem.id.x() 1098 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1099 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1100 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1101 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1102 1103 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 1104 %conv = zext i16 %s1.elt1 to i32 1105 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 1106 %conv2 = zext i16 %s2.elt1 to i32 1107 %mul1 = mul i32 %conv2, %conv 1108 1109 %s1.elt2 = extractelement <4 x i16> %vec1, i64 1 1110 %conv3 = zext i16 %s1.elt2 to i32 1111 %s2.elt2 = extractelement <4 x i16> %vec2, i64 1 1112 %conv4 = zext i16 %s2.elt2 to i32 1113 %mul2 = mul i32 %conv4, %conv3 1114 1115 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1116 %add = add i32 %mul2, %s3 1117 %add6 = add i32 %add, %mul1 1118 store i32 %add6, i32 addrspace(1)* %dst, align 4 1119 ret void 1120} 1121 1122define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, 1123; GFX7-LABEL: udot2_v4i16_Hi: 1124; GFX7: ; %bb.0: ; %entry 1125; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1126; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1127; GFX7-NEXT: s_mov_b32 s3, 0xf000 1128; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1129; GFX7-NEXT: v_mov_b32_e32 v1, 0 1130; GFX7-NEXT: s_mov_b32 s10, 0 1131; GFX7-NEXT: s_mov_b32 s11, s3 1132; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1133; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1134; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 1135; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1136; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 1137; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 1138; GFX7-NEXT: s_mov_b32 s4, 0xffff 1139; GFX7-NEXT: s_mov_b32 s2, -1 1140; GFX7-NEXT: s_waitcnt vmcnt(1) 1141; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 1142; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1143; GFX7-NEXT: s_waitcnt vmcnt(0) 1144; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 1145; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 1146; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1147; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s5 1148; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 1149; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1150; GFX7-NEXT: s_endpgm 1151; 1152; GFX8-LABEL: udot2_v4i16_Hi: 1153; GFX8: ; %bb.0: ; %entry 1154; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1155; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1156; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1157; GFX8-NEXT: s_mov_b32 s2, 0xffff 1158; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1159; GFX8-NEXT: v_mov_b32_e32 v1, s5 1160; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 1161; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1162; GFX8-NEXT: v_mov_b32_e32 v3, s7 1163; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 1164; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1165; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 1166; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1167; GFX8-NEXT: flat_load_dword v2, v[0:1] 1168; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 1169; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1170; GFX8-NEXT: flat_load_dword v0, v[0:1] 1171; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 1172; GFX8-NEXT: s_waitcnt vmcnt(1) 1173; GFX8-NEXT: v_and_b32_e32 v1, s2, v2 1174; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1175; GFX8-NEXT: s_waitcnt vmcnt(0) 1176; GFX8-NEXT: v_and_b32_e32 v3, s2, v0 1177; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1178; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1179; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s3 1180; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 1181; GFX8-NEXT: v_mov_b32_e32 v0, s0 1182; GFX8-NEXT: v_mov_b32_e32 v1, s1 1183; GFX8-NEXT: flat_store_dword v[0:1], v2 1184; GFX8-NEXT: s_endpgm 1185; 1186; GFX9-NODL-LABEL: udot2_v4i16_Hi: 1187; GFX9-NODL: ; %bb.0: ; %entry 1188; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1189; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1190; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1191; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1192; GFX9-NODL-NEXT: global_load_dword v1, 
v0, s[4:5] offset:4 1193; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 1194; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1195; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1196; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1197; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1198; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1199; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1201; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1202; GFX9-NODL-NEXT: s_endpgm 1203; 1204; GFX9-DL-LABEL: udot2_v4i16_Hi: 1205; GFX9-DL: ; %bb.0: ; %entry 1206; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1207; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1208; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1209; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1210; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 1211; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 1212; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1213; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1214; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1215; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1216; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1217; GFX9-DL-NEXT: s_endpgm 1218; 1219; GFX10-DL-LABEL: udot2_v4i16_Hi: 1220; GFX10-DL: ; %bb.0: ; %entry 1221; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1222; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1223; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1224; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1225; GFX10-DL-NEXT: s_clause 0x1 1226; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 1227; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 1228; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1229; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1230; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1231; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 1232; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 
1233; GFX10-DL-NEXT: s_endpgm 1234 <4 x i16> addrspace(1)* %src2, 1235 i32 addrspace(1)* nocapture %dst) { 1236entry: 1237 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1238 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1239 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1240 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1241 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1242 1243 %s1.elt1 = extractelement <4 x i16> %vec1, i64 2 1244 %conv = zext i16 %s1.elt1 to i32 1245 %s2.elt1 = extractelement <4 x i16> %vec2, i64 2 1246 %conv2 = zext i16 %s2.elt1 to i32 1247 %mul1 = mul i32 %conv2, %conv 1248 1249 %s1.elt2 = extractelement <4 x i16> %vec1, i64 3 1250 %conv3 = zext i16 %s1.elt2 to i32 1251 %s2.elt2 = extractelement <4 x i16> %vec2, i64 3 1252 %conv4 = zext i16 %s2.elt2 to i32 1253 %mul2 = mul i32 %conv4, %conv3 1254 1255 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1256 %add = add i32 %mul2, %s3 1257 %add6 = add i32 %add, %mul1 1258 store i32 %add6, i32 addrspace(1)* %dst, align 4 1259 ret void 1260} 1261 1262define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, 1263; GFX7-LABEL: notudot2_v4i16_Even: 1264; GFX7: ; %bb.0: ; %entry 1265; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1266; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1267; GFX7-NEXT: s_mov_b32 s3, 0xf000 1268; GFX7-NEXT: s_mov_b32 s10, 0 1269; GFX7-NEXT: s_mov_b32 s11, s3 1270; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1272; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1273; GFX7-NEXT: v_mov_b32_e32 v1, 0 1274; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 1275; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] 1276; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 1277; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 1278; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 1279; GFX7-NEXT: s_mov_b32 s4, 0xffff 1280; GFX7-NEXT: s_mov_b32 s2, -1 1281; GFX7-NEXT: s_waitcnt vmcnt(1) 
1282; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 1283; GFX7-NEXT: s_waitcnt vmcnt(0) 1284; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 1285; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 1286; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 1287; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s5 1289; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1290; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1291; GFX7-NEXT: s_endpgm 1292; 1293; GFX8-LABEL: notudot2_v4i16_Even: 1294; GFX8: ; %bb.0: ; %entry 1295; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1296; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1297; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1298; GFX8-NEXT: s_mov_b32 s2, 0xffff 1299; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1300; GFX8-NEXT: v_mov_b32_e32 v1, s5 1301; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1302; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1303; GFX8-NEXT: v_mov_b32_e32 v3, s7 1304; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 1305; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1306; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1307; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1308; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 1309; GFX8-NEXT: s_waitcnt vmcnt(1) 1310; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 1311; GFX8-NEXT: s_waitcnt vmcnt(0) 1312; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 1313; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 1314; GFX8-NEXT: v_and_b32_e32 v2, s2, v2 1315; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1316; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s3 1317; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 1318; GFX8-NEXT: v_mov_b32_e32 v0, s0 1319; GFX8-NEXT: v_mov_b32_e32 v1, s1 1320; GFX8-NEXT: flat_store_dword v[0:1], v2 1321; GFX8-NEXT: s_endpgm 1322; 1323; GFX9-NODL-LABEL: notudot2_v4i16_Even: 1324; GFX9-NODL: ; %bb.0: ; %entry 1325; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1326; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1327; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1328; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], 
v4, s[4:5] 1330; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1331; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1332; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 1333; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1334; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1335; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1336; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1337; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 1338; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] 1339; GFX9-NODL-NEXT: s_endpgm 1340; 1341; GFX9-DL-LABEL: notudot2_v4i16_Even: 1342; GFX9-DL: ; %bb.0: ; %entry 1343; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1344; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1345; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1346; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1347; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1348; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1349; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1350; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 1351; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1352; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1353; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1354; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1355; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 1356; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] 1357; GFX9-DL-NEXT: s_endpgm 1358; 1359; GFX10-DL-LABEL: notudot2_v4i16_Even: 1360; GFX10-DL: ; %bb.0: ; %entry 1361; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1362; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1363; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1364; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1365; GFX10-DL-NEXT: s_clause 0x1 1366; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1367; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 
1368; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1369; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1370; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1371; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1372; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1373; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1374; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 1375; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1376; GFX10-DL-NEXT: s_endpgm 1377 <4 x i16> addrspace(1)* %src2, 1378 i32 addrspace(1)* nocapture %dst) { 1379entry: 1380 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1381 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1382 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1383 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1384 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1385 1386 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 1387 %conv = zext i16 %s1.elt1 to i32 1388 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 1389 %conv2 = zext i16 %s2.elt1 to i32 1390 %mul1 = mul i32 %conv2, %conv 1391 1392 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 1393 %conv3 = zext i16 %s1.elt2 to i32 1394 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 1395 %conv4 = zext i16 %s2.elt2 to i32 1396 %mul2 = mul i32 %conv4, %conv3 1397 1398 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1399 %add = add i32 %mul2, %s3 1400 %add6 = add i32 %add, %mul1 1401 store i32 %add6, i32 addrspace(1)* %dst, align 4 1402 ret void 1403} 1404 1405define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, 1406; GFX7-LABEL: notudot2_v4i16_Middle: 1407; GFX7: ; %bb.0: ; %entry 1408; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1409; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1410; GFX7-NEXT: s_mov_b32 s3, 0xf000 1411; GFX7-NEXT: s_mov_b32 s10, 0 1412; GFX7-NEXT: s_mov_b32 s11, s3 1413; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) 1414; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1415; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1416; GFX7-NEXT: v_mov_b32_e32 v1, 0 1417; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 1418; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] 1419; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 1420; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 1421; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 1422; GFX7-NEXT: s_mov_b32 s4, 0xffff 1423; GFX7-NEXT: s_mov_b32 s2, -1 1424; GFX7-NEXT: s_waitcnt vmcnt(1) 1425; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 1426; GFX7-NEXT: s_waitcnt vmcnt(0) 1427; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 1428; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1429; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1430; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1431; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s5 1432; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1433; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1434; GFX7-NEXT: s_endpgm 1435; 1436; GFX8-LABEL: notudot2_v4i16_Middle: 1437; GFX8: ; %bb.0: ; %entry 1438; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1439; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1440; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1441; GFX8-NEXT: s_mov_b32 s2, 0xffff 1442; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX8-NEXT: v_mov_b32_e32 v1, s5 1444; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1445; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1446; GFX8-NEXT: v_mov_b32_e32 v3, s7 1447; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 1448; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1449; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1450; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1451; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 1452; GFX8-NEXT: s_waitcnt vmcnt(1) 1453; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 1454; GFX8-NEXT: s_waitcnt vmcnt(0) 1455; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 1456; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1457; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1458; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1459; GFX8-NEXT: v_mad_u32_u24 v1, v3, 
v1, s3 1460; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 1461; GFX8-NEXT: v_mov_b32_e32 v0, s0 1462; GFX8-NEXT: v_mov_b32_e32 v1, s1 1463; GFX8-NEXT: flat_store_dword v[0:1], v2 1464; GFX8-NEXT: s_endpgm 1465; 1466; GFX9-NODL-LABEL: notudot2_v4i16_Middle: 1467; GFX9-NODL: ; %bb.0: ; %entry 1468; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1469; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1470; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1471; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1472; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1473; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1474; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1475; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 1476; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1477; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1478; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1479; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 1481; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] 1482; GFX9-NODL-NEXT: s_endpgm 1483; 1484; GFX9-DL-LABEL: notudot2_v4i16_Middle: 1485; GFX9-DL: ; %bb.0: ; %entry 1486; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1487; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1488; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1489; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1491; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1492; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1493; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 1494; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1495; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1496; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1497; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX9-DL-NEXT: 
v_add3_u32 v0, v1, s0, v0 1499; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] 1500; GFX9-DL-NEXT: s_endpgm 1501; 1502; GFX10-DL-LABEL: notudot2_v4i16_Middle: 1503; GFX10-DL: ; %bb.0: ; %entry 1504; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1505; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1506; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1507; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1508; GFX10-DL-NEXT: s_clause 0x1 1509; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1510; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1511; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1512; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1513; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1514; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1515; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1516; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1517; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 1518; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1519; GFX10-DL-NEXT: s_endpgm 1520 <4 x i16> addrspace(1)* %src2, 1521 i32 addrspace(1)* nocapture %dst) { 1522entry: 1523 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1524 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1525 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1526 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1527 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1528 1529 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1 1530 %conv = zext i16 %s1.elt1 to i32 1531 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1 1532 %conv2 = zext i16 %s2.elt1 to i32 1533 %mul1 = mul i32 %conv2, %conv 1534 1535 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 1536 %conv3 = zext i16 %s1.elt2 to i32 1537 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 1538 %conv4 = zext i16 %s2.elt2 to i32 1539 %mul2 = mul i32 %conv4, %conv3 1540 1541 %s3 = load i32, i32 
addrspace(1)* %dst, align 4 1542 %add = add i32 %mul2, %s3 1543 %add6 = add i32 %add, %mul1 1544 store i32 %add6, i32 addrspace(1)* %dst, align 4 1545 ret void 1546} 1547
; Kernel notudot2_DiffIndex
;   Loads one <2 x i16> per work-item from %src1 and %src2, zero-extends the
;   lanes to i32 and accumulates two products into the i32 loaded from %dst.
;   The lane indices are crossed between the operands: one product is
;   %vec1[0] * %vec2[1], the other is %vec1[1] * %vec2[0].
;   The expected output for every check prefix keeps separate multiply/add
;   instructions (v_mad_u32_u24, or v_mul_u32_u24_sdwa plus v_add3_u32); no
;   v_dot2 instruction appears in it.
;   The assertion lines are autogenerated (utils/update_llc_test_checks.py);
;   regenerate them instead of editing by hand.
1548define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, 1549; GFX7-LABEL: notudot2_DiffIndex: 1550; GFX7: ; %bb.0: ; %entry 1551; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1552; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1553; GFX7-NEXT: s_mov_b32 s3, 0xf000 1554; GFX7-NEXT: s_mov_b32 s10, 0 1555; GFX7-NEXT: s_mov_b32 s11, s3 1556; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1557; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1558; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1559; GFX7-NEXT: v_mov_b32_e32 v1, 0 1560; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1561; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1562; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1563; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 1564; GFX7-NEXT: s_mov_b32 s4, 0xffff 1565; GFX7-NEXT: s_mov_b32 s2, -1 1566; GFX7-NEXT: s_waitcnt vmcnt(1) 1567; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1568; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 1569; GFX7-NEXT: s_waitcnt vmcnt(0) 1570; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1571; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 1572; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1573; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s5 1574; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0 1575; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1576; GFX7-NEXT: s_endpgm 1577; 1578; GFX8-LABEL: notudot2_DiffIndex: 1579; GFX8: ; %bb.0: ; %entry 1580; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1581; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1582; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1583; GFX8-NEXT: s_mov_b32 s2, 0xffff 1584; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX8-NEXT: v_mov_b32_e32 v1, s5 1586; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1587; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1588; GFX8-NEXT: flat_load_dword v3, v[0:1] 1589; GFX8-NEXT: v_mov_b32_e32 v1, s7 1590; GFX8-NEXT: v_add_u32_e32 v0, 
vcc, s6, v2 1591; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1592; GFX8-NEXT: flat_load_dword v0, v[0:1] 1593; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 1594; GFX8-NEXT: s_waitcnt vmcnt(1) 1595; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 1596; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1597; GFX8-NEXT: s_waitcnt vmcnt(0) 1598; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1599; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 1600; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 1602; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 1603; GFX8-NEXT: v_mov_b32_e32 v0, s0 1604; GFX8-NEXT: v_mov_b32_e32 v1, s1 1605; GFX8-NEXT: flat_store_dword v[0:1], v2 1606; GFX8-NEXT: s_endpgm 1607; 1608; GFX9-NODL-LABEL: notudot2_DiffIndex: 1609; GFX9-NODL: ; %bb.0: ; %entry 1610; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1611; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1612; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1613; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1615; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1616; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1617; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1618; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1619; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1620; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1621; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1622; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1623; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1624; GFX9-NODL-NEXT: s_endpgm 1625; 1626; GFX9-DL-LABEL: notudot2_DiffIndex: 1627; GFX9-DL: ; %bb.0: ; %entry 1628; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1629; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1630; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1631; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1632; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1633; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] 1634; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1635; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1636; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1637; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1638; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1639; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1640; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 1641; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1642; GFX9-DL-NEXT: s_endpgm 1643; 1644; GFX10-DL-LABEL: notudot2_DiffIndex: 1645; GFX10-DL: ; %bb.0: ; %entry 1646; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1647; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1648; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1649; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1650; GFX10-DL-NEXT: s_clause 0x1 1651; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1652; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1653; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1654; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1655; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1656; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1657; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1658; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1659; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 1660; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1661; GFX10-DL-NEXT: s_endpgm 1662 <2 x i16> addrspace(1)* %src2, 1663 i32 addrspace(1)* nocapture %dst) { 1664entry: 1665 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1666 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 1667 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 1668 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 1669 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 1670 1671 %s1.elt1 = extractelement <2 x i16> 
%vec1, i64 0 1672 %conv = zext i16 %s1.elt1 to i32 1673 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1 1674 %conv2 = zext i16 %s2.elt1 to i32 1675 %mul1 = mul i32 %conv2, %conv 1676 1677 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 1678 %conv3 = zext i16 %s1.elt2 to i32 1679 %s2.elt2 = extractelement <2 x i16> %vec2, i64 0 1680 %conv4 = zext i16 %s2.elt2 to i32 1681 %mul2 = mul i32 %conv4, %conv3 1682 1683 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1684 %add = add i32 %mul2, %s3 1685 %add6 = add i32 %add, %mul1 1686 store i32 %add6, i32 addrspace(1)* %dst, align 4 1687 ret void 1688} 1689
; Kernel udot2_MultipleUses_add1
;   Unsigned (zext) two-lane multiply-accumulate with matching lane indices:
;   %mul1 = %vec2[0] * %vec1[0], %mul2 = %vec2[1] * %vec1[1].
;   The inner sum %add1 = %mul2 + %s3 has two users: it feeds %add2 and is
;   added a second time to produce %res, which is stored back to %dst.
;   The expected output for every check prefix uses v_mad_u32_u24 together
;   with a plain add or v_add3_u32; no v_dot2 instruction appears in it.
;   The assertion lines are autogenerated (utils/update_llc_test_checks.py);
;   regenerate them instead of editing by hand.
1690define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1, 1691; GFX7-LABEL: udot2_MultipleUses_add1: 1692; GFX7: ; %bb.0: ; %entry 1693; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1694; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1695; GFX7-NEXT: s_mov_b32 s3, 0xf000 1696; GFX7-NEXT: s_mov_b32 s10, 0 1697; GFX7-NEXT: s_mov_b32 s11, s3 1698; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1700; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1701; GFX7-NEXT: v_mov_b32_e32 v1, 0 1702; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1703; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1704; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1705; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 1706; GFX7-NEXT: s_mov_b32 s4, 0xffff 1707; GFX7-NEXT: s_mov_b32 s2, -1 1708; GFX7-NEXT: s_waitcnt vmcnt(1) 1709; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1710; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 1711; GFX7-NEXT: s_waitcnt vmcnt(0) 1712; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1713; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 1714; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1715; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 1716; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1717; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1718; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1719; GFX7-NEXT: s_endpgm 1720; 1721; GFX8-LABEL: udot2_MultipleUses_add1: 1722; GFX8: ; 
%bb.0: ; %entry 1723; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1724; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1725; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1726; GFX8-NEXT: s_mov_b32 s2, 0xffff 1727; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1728; GFX8-NEXT: v_mov_b32_e32 v1, s5 1729; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1730; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1731; GFX8-NEXT: flat_load_dword v3, v[0:1] 1732; GFX8-NEXT: v_mov_b32_e32 v1, s7 1733; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1734; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1735; GFX8-NEXT: flat_load_dword v0, v[0:1] 1736; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 1737; GFX8-NEXT: s_waitcnt vmcnt(1) 1738; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 1739; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1740; GFX8-NEXT: s_waitcnt vmcnt(0) 1741; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 1742; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1743; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1744; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 1745; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 1746; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1747; GFX8-NEXT: v_mov_b32_e32 v0, s0 1748; GFX8-NEXT: v_mov_b32_e32 v1, s1 1749; GFX8-NEXT: flat_store_dword v[0:1], v2 1750; GFX8-NEXT: s_endpgm 1751; 1752; GFX9-NODL-LABEL: udot2_MultipleUses_add1: 1753; GFX9-NODL: ; %bb.0: ; %entry 1754; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1755; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1756; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1757; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1758; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1759; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1760; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1761; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1762; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1763; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1764; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1765; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1766; 
GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1767; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 1768; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 1769; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1770; GFX9-NODL-NEXT: s_endpgm 1771; 1772; GFX9-DL-LABEL: udot2_MultipleUses_add1: 1773; GFX9-DL: ; %bb.0: ; %entry 1774; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1775; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1776; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1777; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1778; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1779; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1780; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1781; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1782; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1783; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1784; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1785; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1786; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1787; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 1788; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 1789; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1790; GFX9-DL-NEXT: s_endpgm 1791; 1792; GFX10-DL-LABEL: udot2_MultipleUses_add1: 1793; GFX10-DL: ; %bb.0: ; %entry 1794; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1795; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1796; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1797; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1798; GFX10-DL-NEXT: s_clause 0x1 1799; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1800; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1801; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1802; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1803; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1804; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1805; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1806; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1807; GFX10-DL-NEXT: v_mov_b32_e32 v2, 
0 1808; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1809; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 1810; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 1811; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1812; GFX10-DL-NEXT: s_endpgm 1813 <2 x i16> addrspace(1)* %src2, 1814 i32 addrspace(1)* nocapture %dst) { 1815entry: 1816 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1817 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 1818 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 1819 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 1820 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 1821 1822 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 1823 %conv = zext i16 %s1.elt1 to i32 1824 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 1825 %conv2 = zext i16 %s2.elt1 to i32 1826 %mul1 = mul i32 %conv2, %conv 1827 1828 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 1829 %conv3 = zext i16 %s1.elt2 to i32 1830 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 1831 %conv4 = zext i16 %s2.elt2 to i32 1832 %mul2 = mul i32 %conv4, %conv3 1833 1834 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1835 %add1 = add i32 %mul2, %s3 1836 %add2 = add i32 %add1, %mul1 1837 1838 %res = add i32 %add2, %add1 1839 store i32 %res, i32 addrspace(1)* %dst, align 4 1840 ret void 1841} 1842
; Kernel idot2_MultipleUses_add1
;   Signed (sext) two-lane multiply-accumulate with matching lane indices:
;   %mul1 = %vec2[0] * %vec1[0], %mul2 = %vec2[1] * %vec1[1].
;   The inner sum %add1 = %mul2 + %s3 has two users: it feeds %add2 and is
;   added a second time to produce %res, which is stored back to %dst.
;   The expected output for every check prefix uses v_mad_i32_i24 together
;   with a plain add or v_add3_u32; no v_dot2 instruction appears in it.
;   The assertion lines are autogenerated (utils/update_llc_test_checks.py);
;   regenerate them instead of editing by hand.
1843define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1, 1844; GFX7-LABEL: idot2_MultipleUses_add1: 1845; GFX7: ; %bb.0: ; %entry 1846; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1847; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1848; GFX7-NEXT: s_mov_b32 s3, 0xf000 1849; GFX7-NEXT: s_mov_b32 s10, 0 1850; GFX7-NEXT: s_mov_b32 s11, s3 1851; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1852; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1853; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1854; GFX7-NEXT: v_mov_b32_e32 v1, 0 1855; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1856; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1857; GFX7-NEXT: 
buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1858; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1859; GFX7-NEXT: s_mov_b32 s2, -1 1860; GFX7-NEXT: s_waitcnt vmcnt(1) 1861; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 1862; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1863; GFX7-NEXT: s_waitcnt vmcnt(0) 1864; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 1865; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 1866; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1867; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 1868; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 1869; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1870; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1871; GFX7-NEXT: s_endpgm 1872; 1873; GFX8-LABEL: idot2_MultipleUses_add1: 1874; GFX8: ; %bb.0: ; %entry 1875; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1876; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1877; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1878; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1879; GFX8-NEXT: v_mov_b32_e32 v1, s5 1880; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1881; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1882; GFX8-NEXT: flat_load_dword v3, v[0:1] 1883; GFX8-NEXT: v_mov_b32_e32 v1, s7 1884; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1885; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1886; GFX8-NEXT: flat_load_dword v0, v[0:1] 1887; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1888; GFX8-NEXT: s_waitcnt vmcnt(1) 1889; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 1890; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 1891; GFX8-NEXT: s_waitcnt vmcnt(0) 1892; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 1893; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 1894; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1895; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 1896; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0 1897; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1898; GFX8-NEXT: v_mov_b32_e32 v0, s0 1899; GFX8-NEXT: v_mov_b32_e32 v1, s1 1900; GFX8-NEXT: flat_store_dword v[0:1], v2 1901; GFX8-NEXT: s_endpgm 1902; 1903; GFX9-NODL-LABEL: idot2_MultipleUses_add1: 1904; GFX9-NODL: ; %bb.0: ; %entry 1905; GFX9-NODL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 1906; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1907; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1908; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1909; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1910; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1911; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1912; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1913; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1914; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1915; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 1916; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1917; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1918; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 1919; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 1920; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1921; GFX9-NODL-NEXT: s_endpgm 1922; 1923; GFX9-DL-LABEL: idot2_MultipleUses_add1: 1924; GFX9-DL: ; %bb.0: ; %entry 1925; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1926; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1927; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1928; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1929; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1930; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1931; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1932; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1933; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1934; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1935; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 1936; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1937; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 1939; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 1940; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1941; GFX9-DL-NEXT: s_endpgm 1942; 1943; GFX10-DL-LABEL: idot2_MultipleUses_add1: 1944; GFX10-DL: ; %bb.0: ; %entry 1945; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 1946; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1947; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1948; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX10-DL-NEXT: s_clause 0x1 1950; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1951; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1952; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1953; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1954; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1 1955; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1956; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2 1957; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1958; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1959; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1960; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 1961; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 1962; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1963; GFX10-DL-NEXT: s_endpgm 1964 <2 x i16> addrspace(1)* %src2, 1965 i32 addrspace(1)* nocapture %dst) { 1966entry: 1967 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1968 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 1969 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 1970 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 1971 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 1972 1973 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 1974 %conv = sext i16 %s1.elt1 to i32 1975 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 1976 %conv2 = sext i16 %s2.elt1 to i32 1977 %mul1 = mul i32 %conv2, %conv 1978 1979 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 1980 %conv3 = sext i16 %s1.elt2 to i32 1981 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 1982 %conv4 = sext i16 %s2.elt2 to i32 1983 %mul2 = mul i32 %conv4, %conv3 1984 1985 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1986 %add1 = add i32 %mul2, %s3 1987 %add2 = add i32 %add1, %mul1 1988 1989 %res = add i32 %add2, %add1 1990 store i32 %res, i32 addrspace(1)* %dst, align 4 
1991 ret void 1992} 1993
; Kernel udot2_MultipleUses_mul1
;   Unsigned (zext) two-lane multiply-accumulate with matching lane indices:
;   %mul1 = %vec2[0] * %vec1[0], %mul2 = %vec2[1] * %vec1[1].
;   The product %mul1 has two users: it is added to %s3 to form %add0 and
;   added again to %add1 (= %mul2 + %add0) to form the stored value %add2.
;   The expected output for every check prefix uses v_mad_u32_u24, with
;   v_mul_u32_u24 and v_add3_u32 on the newer targets; no v_dot2 instruction
;   appears in it.
;   The assertion lines are autogenerated (utils/update_llc_test_checks.py);
;   regenerate them instead of editing by hand.
1994define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1, 1995; GFX7-LABEL: udot2_MultipleUses_mul1: 1996; GFX7: ; %bb.0: ; %entry 1997; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1998; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1999; GFX7-NEXT: s_mov_b32 s3, 0xf000 2000; GFX7-NEXT: s_mov_b32 s10, 0 2001; GFX7-NEXT: s_mov_b32 s11, s3 2002; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2003; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 2004; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2005; GFX7-NEXT: v_mov_b32_e32 v1, 0 2006; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2007; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 2008; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2009; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 2010; GFX7-NEXT: s_mov_b32 s4, 0xffff 2011; GFX7-NEXT: s_mov_b32 s2, -1 2012; GFX7-NEXT: s_waitcnt vmcnt(1) 2013; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 2014; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 2015; GFX7-NEXT: s_waitcnt vmcnt(0) 2016; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2017; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 2018; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2019; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s5 2020; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 2021; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 2022; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 2023; GFX7-NEXT: s_endpgm 2024; 2025; GFX8-LABEL: udot2_MultipleUses_mul1: 2026; GFX8: ; %bb.0: ; %entry 2027; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2028; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2029; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2030; GFX8-NEXT: s_mov_b32 s2, 0xffff 2031; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2032; GFX8-NEXT: v_mov_b32_e32 v1, s5 2033; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2034; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2035; GFX8-NEXT: flat_load_dword v3, v[0:1] 2036; GFX8-NEXT: v_mov_b32_e32 v1, s7 2037; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 2038; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2039; GFX8-NEXT: 
flat_load_dword v0, v[0:1] 2040; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 2041; GFX8-NEXT: s_waitcnt vmcnt(1) 2042; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 2043; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2044; GFX8-NEXT: s_waitcnt vmcnt(0) 2045; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 2046; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2047; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2048; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s3 2049; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 2050; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 2051; GFX8-NEXT: v_mov_b32_e32 v0, s0 2052; GFX8-NEXT: v_mov_b32_e32 v1, s1 2053; GFX8-NEXT: flat_store_dword v[0:1], v2 2054; GFX8-NEXT: s_endpgm 2055; 2056; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: 2057; GFX9-NODL: ; %bb.0: ; %entry 2058; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2059; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2060; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2061; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff 2062; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2063; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 2064; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 2065; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 2066; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2067; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2068; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 2069; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2070; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 2071; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2072; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 2073; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2074; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s1 2075; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 2076; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 2077; GFX9-NODL-NEXT: s_endpgm 2078; 2079; GFX9-DL-LABEL: udot2_MultipleUses_mul1: 2080; GFX9-DL: ; %bb.0: ; %entry 2081; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2082; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2083; GFX9-DL-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 2084; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff 2085; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2086; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 2087; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 2088; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2089; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2090; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2091; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v1 2092; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2093; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v2 2094; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2095; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 2096; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2097; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s1 2098; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 2099; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 2100; GFX9-DL-NEXT: s_endpgm 2101; 2102; GFX10-DL-LABEL: udot2_MultipleUses_mul1: 2103; GFX10-DL: ; %bb.0: ; %entry 2104; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2105; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2106; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2107; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff 2108; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2109; GFX10-DL-NEXT: s_clause 0x1 2110; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 2111; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 2112; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2113; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2114; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 2115; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2116; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 2117; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2118; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 2119; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2120; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 2121; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2122; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2 2123; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] 2124; GFX10-DL-NEXT: s_endpgm 2125 <2 x i16> 
addrspace(1)* %src2, 2126 i32 addrspace(1)* nocapture %dst) { 2127entry: 2128 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2129 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 2130 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 2131 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 2132 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 2133 2134 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 2135 %conv = zext i16 %s1.elt1 to i32 2136 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 2137 %conv2 = zext i16 %s2.elt1 to i32 2138 %mul1 = mul i32 %conv2, %conv 2139 2140 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 2141 %conv3 = zext i16 %s1.elt2 to i32 2142 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 2143 %conv4 = zext i16 %s2.elt2 to i32 2144 %mul2 = mul i32 %conv4, %conv3 2145 2146 %s3 = load i32, i32 addrspace(1)* %dst, align 4 2147 %add0 = add i32 %mul1, %s3 2148 2149 %add1 = add i32 %mul2, %add0 2150 %add2 = add i32 %add1, %mul1 2151 2152 store i32 %add2, i32 addrspace(1)* %dst, align 4 2153 ret void 2154} 2155
; Kernel idot2_MultipleUses_mul1
;   Signed (sext) two-lane multiply-accumulate with matching lane indices:
;   %mul1 = %vec2[0] * %vec1[0], %mul2 = %vec2[1] * %vec1[1].
;   The product %mul1 has two users: it is added to %s3 to form %add0 and
;   added again to %add1 (= %mul2 + %add0) to form the stored value %add2.
;   The expected output for every check prefix uses v_mad_i32_i24, with
;   v_mul_i32_i24 and v_add3_u32 on the newer targets; no v_dot2 instruction
;   appears in it.
;   The assertion lines are autogenerated (utils/update_llc_test_checks.py);
;   regenerate them instead of editing by hand.
2156define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1, 2157; GFX7-LABEL: idot2_MultipleUses_mul1: 2158; GFX7: ; %bb.0: ; %entry 2159; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2160; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2161; GFX7-NEXT: s_mov_b32 s3, 0xf000 2162; GFX7-NEXT: s_mov_b32 s10, 0 2163; GFX7-NEXT: s_mov_b32 s11, s3 2164; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2165; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 2166; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2167; GFX7-NEXT: v_mov_b32_e32 v1, 0 2168; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2169; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 2170; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2171; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 2172; GFX7-NEXT: s_mov_b32 s2, -1 2173; GFX7-NEXT: s_waitcnt vmcnt(1) 2174; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 2175; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, 
v2 2176; GFX7-NEXT: s_waitcnt vmcnt(0) 2177; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 2178; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2179; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2180; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4 2181; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 2182; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 2183; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 2184; GFX7-NEXT: s_endpgm 2185; 2186; GFX8-LABEL: idot2_MultipleUses_mul1: 2187; GFX8: ; %bb.0: ; %entry 2188; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2189; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2190; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2191; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2192; GFX8-NEXT: v_mov_b32_e32 v1, s5 2193; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2194; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2195; GFX8-NEXT: flat_load_dword v3, v[0:1] 2196; GFX8-NEXT: v_mov_b32_e32 v1, s7 2197; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 2198; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2199; GFX8-NEXT: flat_load_dword v0, v[0:1] 2200; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 2201; GFX8-NEXT: s_waitcnt vmcnt(1) 2202; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 2203; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 2204; GFX8-NEXT: s_waitcnt vmcnt(0) 2205; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 2206; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2207; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2208; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s2 2209; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 2210; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 2211; GFX8-NEXT: v_mov_b32_e32 v0, s0 2212; GFX8-NEXT: v_mov_b32_e32 v1, s1 2213; GFX8-NEXT: flat_store_dword v[0:1], v2 2214; GFX8-NEXT: s_endpgm 2215; 2216; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: 2217; GFX9-NODL: ; %bb.0: ; %entry 2218; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2219; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2220; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2221; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2222; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 2223; 
GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 2224; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 2225; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2226; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2227; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 2228; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2229; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16 2230; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2231; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 2232; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2233; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 2234; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 2235; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 2236; GFX9-NODL-NEXT: s_endpgm 2237; 2238; GFX9-DL-LABEL: idot2_MultipleUses_mul1: 2239; GFX9-DL: ; %bb.0: ; %entry 2240; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2241; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2242; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2243; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2244; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 2245; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 2246; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 2247; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2248; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2249; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 2250; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2251; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16 2252; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2253; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 2254; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2255; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 2256; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 2257; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 2258; GFX9-DL-NEXT: s_endpgm 2259; 2260; GFX10-DL-LABEL: idot2_MultipleUses_mul1: 2261; GFX10-DL: ; %bb.0: ; %entry 2262; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2263; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2264; GFX10-DL-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 2265; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2266; GFX10-DL-NEXT: s_clause 0x1 2267; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 2268; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 2269; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2270; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2271; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16 2272; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2273; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 16 2274; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2275; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0 2276; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2277; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 2278; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2279; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2 2280; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] 2281; GFX10-DL-NEXT: s_endpgm 2282 <2 x i16> addrspace(1)* %src2, 2283 i32 addrspace(1)* nocapture %dst) { 2284entry: 2285 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2286 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 2287 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 2288 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 2289 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 2290 2291 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 2292 %conv = sext i16 %s1.elt1 to i32 2293 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 2294 %conv2 = sext i16 %s2.elt1 to i32 2295 %mul1 = mul i32 %conv2, %conv 2296 2297 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 2298 %conv3 = sext i16 %s1.elt2 to i32 2299 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 2300 %conv4 = sext i16 %s2.elt2 to i32 2301 %mul2 = mul i32 %conv4, %conv3 2302 2303 %s3 = load i32, i32 addrspace(1)* %dst, align 4 2304 %add0 = add i32 %mul1, %s3 2305 2306 %add1 = add i32 %mul2, %add0 2307 %add2 = add i32 %add1, %mul1 2308 2309 store i32 %add2, i32 addrspace(1)* %dst, align 4 2310 ret void 2311} 2312 
; Unsigned (zext) 2 x i16 multiply-accumulate in which the high-element product
; %mul2 has two uses: it feeds both %add0 and %add1. The expected code for every
; tested subtarget is built from 24-bit multiply / multiply-add instructions
; (plus v_add3_u32 on gfx9 and newer); no v_dot2 instruction appears in any of
; the expectations below.
define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MultipleUses_mul2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s5
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s3
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                                   <2 x i16> addrspace(1)* %src2,
                                                   i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  ; First use of %mul2.
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2; this extra use is what the test name refers to.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Signed (sext) counterpart of the kernel above: %mul2 again feeds both %add0
; and %add1. The expectations use signed 24-bit multiply / multiply-add
; instructions for every tested subtarget; no v_dot2 instruction appears.
define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MultipleUses_mul2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v4, v0, v3, s2
; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                                   <2 x i16> addrspace(1)* %src2,
                                                   i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  ; First use of %mul2.
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2; this extra use is what the test name refers to.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; 16-bit accumulator variant: the two products are i16 multiplies of the
; extracted elements (no extension to i32), the accumulator is loaded from %dst
; as i16, and the sum is stored back as i16. The expectations use
; v_mad_u32_u24 on gfx700 and 16-bit mad instructions (v_mad_u16 /
; v_mad_legacy_u16) on the other subtargets; no v_dot2 instruction appears.
define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_acc16:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[2:3]
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
; GFX9-NODL-NEXT:    global_store_short v1, v0, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
; GFX10-DL-NEXT:    s_endpgm
                                       <2 x i16> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %v1e1 = extractelement <2 x i16> %v1, i64 0
  %v2e1 = extractelement <2 x i16> %v2, i64 0
  %mul1 = mul i16 %v1e1, %v2e1

  %v1e2 = extractelement <2 x i16> %v1, i64 1
  %v2e2 = extractelement <2 x i16> %v2, i64 1
  %mul2 = mul i16 %v1e2, %v2e2

  %s2 = load i16, i16 addrspace(1)* %dst, align 2
  %add1 = add i16 %mul2, %s2
  %add2 = add i16 %add1, %mul1
  store i16 %add2, i16 addrspace(1)* %dst, align 2
  ret void
}

; <2 x i8> sources: each element is sign-extended from i8 to i32 before the
; multiply, and the accumulator is i32. The expectations for every tested
; subtarget use signed 24-bit multiply / multiply-add instructions (plus
; v_add3_u32 on gfx9 and newer); no v_dot2 instruction appears.
define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX7-LABEL: notsdot2_sext8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: notsdot2_sext8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: notsdot2_sext8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: notsdot2_sext8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                          <2 x i8> addrspace(1)* %src2,
                                          i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
  %conv = sext i8 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
  %conv2 = sext i8 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
  %conv3 = sext i8 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
  %conv4 = sext i8 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Work-item id intrinsic; each kernel above calls it to compute the element
; index used in the getelementptr on %src1 and %src2.
declare i32 @llvm.amdgcn.workitem.id.x()