1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s 8 9; add(mul(S0.x, S1.x), 10; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) 11 12define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, 13; GFX7-LABEL: udot2: 14; GFX7: ; %bb.0: ; %entry 15; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 16; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 17; GFX7-NEXT: s_mov_b32 s3, 0xf000 18; GFX7-NEXT: s_mov_b32 s10, 0 19; GFX7-NEXT: s_mov_b32 s11, s3 20; GFX7-NEXT: s_waitcnt lgkmcnt(0) 21; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 22; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 23; GFX7-NEXT: v_mov_b32_e32 v1, 0 24; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 25; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 26; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 27; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 28; GFX7-NEXT: s_mov_b32 s2, -1 29; GFX7-NEXT: s_waitcnt vmcnt(1) 30; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 31; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 32; GFX7-NEXT: s_waitcnt vmcnt(0) 33; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 34; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 35; GFX7-NEXT: s_waitcnt lgkmcnt(0) 36; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 37; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 38; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 39; GFX7-NEXT: s_endpgm 40; 41; 
GFX8-LABEL: udot2: 42; GFX8: ; %bb.0: ; %entry 43; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 44; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 45; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 46; GFX8-NEXT: s_waitcnt lgkmcnt(0) 47; GFX8-NEXT: v_mov_b32_e32 v1, s5 48; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 49; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 50; GFX8-NEXT: flat_load_dword v3, v[0:1] 51; GFX8-NEXT: v_mov_b32_e32 v1, s7 52; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 53; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 54; GFX8-NEXT: flat_load_dword v0, v[0:1] 55; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 56; GFX8-NEXT: s_waitcnt vmcnt(1) 57; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 58; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 59; GFX8-NEXT: s_waitcnt vmcnt(0) 60; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 61; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 62; GFX8-NEXT: s_waitcnt lgkmcnt(0) 63; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 64; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 65; GFX8-NEXT: v_mov_b32_e32 v0, s0 66; GFX8-NEXT: v_mov_b32_e32 v1, s1 67; GFX8-NEXT: flat_store_dword v[0:1], v2 68; GFX8-NEXT: s_endpgm 69; 70; GFX9-NODL-LABEL: udot2: 71; GFX9-NODL: ; %bb.0: ; %entry 72; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 73; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 74; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 75; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 76; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 77; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 78; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 79; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 80; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 81; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 82; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 83; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 84; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 85; GFX9-NODL-NEXT: global_store_dword v0, v1, 
s[2:3] 86; GFX9-NODL-NEXT: s_endpgm 87; 88; GFX9-DL-LABEL: udot2: 89; GFX9-DL: ; %bb.0: ; %entry 90; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 91; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 92; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 93; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 94; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 95; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 96; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 97; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 98; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 100; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 101; GFX9-DL-NEXT: s_endpgm 102; 103; GFX10-DL-LABEL: udot2: 104; GFX10-DL: ; %bb.0: ; %entry 105; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 106; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 107; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 108; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 109; GFX10-DL-NEXT: s_clause 0x1 110; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 111; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 112; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 113; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 114; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 115; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 116; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 117; GFX10-DL-NEXT: s_endpgm 118 <2 x i16> addrspace(1)* %src2, 119 i32 addrspace(1)* nocapture %dst) { 120entry: 121 %idx = call i32 @llvm.amdgcn.workitem.id.x() 122 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 123 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 124 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 125 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 126 127 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 128 %conv = zext i16 %s1.elt1 to i32 129 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 130 %conv2 = zext i16 %s2.elt1 to i32 131 %mul1 = mul nuw i32 %conv2, %conv 132 133 %s1.elt2 = extractelement <2 x 
i16> %vec1, i64 1 134 %conv3 = zext i16 %s1.elt2 to i32 135 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 136 %conv4 = zext i16 %s2.elt2 to i32 137 %mul2 = mul nuw i32 %conv4, %conv3 138 139 %s3 = load i32, i32 addrspace(1)* %dst, align 4 140 %add = add i32 %mul2, %s3 141 %add6 = add i32 %add, %mul1 142 store i32 %add6, i32 addrspace(1)* %dst, align 4 143 ret void 144} 145 146; TODO: Support this pattern 147; add(S3, 148; add (mul (S0.y, S1.y), mul (S0.x, S1.x))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) 149define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, 150; GFX7-LABEL: udot2_MulMul: 151; GFX7: ; %bb.0: ; %entry 152; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 153; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 154; GFX7-NEXT: s_mov_b32 s3, 0xf000 155; GFX7-NEXT: s_mov_b32 s10, 0 156; GFX7-NEXT: s_mov_b32 s11, s3 157; GFX7-NEXT: s_waitcnt lgkmcnt(0) 158; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 159; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 160; GFX7-NEXT: v_mov_b32_e32 v1, 0 161; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 162; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 163; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 164; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 165; GFX7-NEXT: s_mov_b32 s2, -1 166; GFX7-NEXT: s_waitcnt vmcnt(1) 167; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 168; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 169; GFX7-NEXT: s_waitcnt vmcnt(0) 170; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 171; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 172; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 173; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 174; GFX7-NEXT: s_waitcnt lgkmcnt(0) 175; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 176; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 177; GFX7-NEXT: s_endpgm 178; 179; GFX8-LABEL: udot2_MulMul: 180; GFX8: ; %bb.0: ; %entry 181; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 182; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 183; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 184; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) 185; GFX8-NEXT: v_mov_b32_e32 v1, s5 186; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 187; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 188; GFX8-NEXT: flat_load_dword v3, v[0:1] 189; GFX8-NEXT: v_mov_b32_e32 v1, s7 190; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 191; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 192; GFX8-NEXT: flat_load_dword v0, v[0:1] 193; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 194; GFX8-NEXT: s_waitcnt vmcnt(1) 195; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 196; GFX8-NEXT: s_waitcnt vmcnt(0) 197; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 198; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 199; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1 200; GFX8-NEXT: s_waitcnt lgkmcnt(0) 201; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 202; GFX8-NEXT: v_mov_b32_e32 v0, s0 203; GFX8-NEXT: v_mov_b32_e32 v1, s1 204; GFX8-NEXT: flat_store_dword v[0:1], v2 205; GFX8-NEXT: s_endpgm 206; 207; GFX9-NODL-LABEL: udot2_MulMul: 208; GFX9-NODL: ; %bb.0: ; %entry 209; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 210; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 211; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 212; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 213; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 214; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 215; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 216; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 217; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 218; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 219; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 220; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 221; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 222; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 223; GFX9-NODL-NEXT: s_endpgm 224; 225; GFX9-DL-LABEL: udot2_MulMul: 226; GFX9-DL: ; %bb.0: ; %entry 227; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 228; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 229; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 230; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 231; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 232; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 233; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 234; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 235; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 236; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 237; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 238; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 239; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 240; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 241; GFX9-DL-NEXT: s_endpgm 242; 243; GFX10-DL-LABEL: udot2_MulMul: 244; GFX10-DL: ; %bb.0: ; %entry 245; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 246; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 247; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 248; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 249; GFX10-DL-NEXT: s_clause 0x1 250; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 251; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 252; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 253; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 254; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 255; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 256; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 257; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 258; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s2 259; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 260; GFX10-DL-NEXT: s_endpgm 261 <2 x i16> addrspace(1)* %src2, 262 i32 addrspace(1)* nocapture %dst) { 263entry: 264 %idx = call i32 @llvm.amdgcn.workitem.id.x() 265 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 266 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 
267 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 268 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 269 270 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 271 %conv = zext i16 %s1.elt1 to i32 272 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 273 %conv2 = zext i16 %s2.elt1 to i32 274 %mul1 = mul nuw i32 %conv2, %conv 275 276 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 277 %conv3 = zext i16 %s1.elt2 to i32 278 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 279 %conv4 = zext i16 %s2.elt2 to i32 280 %mul2 = mul nuw i32 %conv4, %conv3 281 %s3 = load i32, i32 addrspace(1)* %dst, align 4 282 %add = add i32 %mul2, %mul1 283 %add6 = add i32 %add, %s3 284 store i32 %add6, i32 addrspace(1)* %dst, align 4 285 ret void 286} 287 288define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, 289; GFX7-LABEL: idot2: 290; GFX7: ; %bb.0: ; %entry 291; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 292; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 293; GFX7-NEXT: s_mov_b32 s3, 0xf000 294; GFX7-NEXT: s_mov_b32 s10, 0 295; GFX7-NEXT: s_mov_b32 s11, s3 296; GFX7-NEXT: s_waitcnt lgkmcnt(0) 297; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 298; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 299; GFX7-NEXT: v_mov_b32_e32 v1, 0 300; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 301; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 302; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 303; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 304; GFX7-NEXT: s_mov_b32 s2, -1 305; GFX7-NEXT: s_waitcnt vmcnt(1) 306; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 307; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 308; GFX7-NEXT: s_waitcnt vmcnt(0) 309; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 310; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 311; GFX7-NEXT: s_waitcnt lgkmcnt(0) 312; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 313; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 314; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 315; GFX7-NEXT: s_endpgm 316; 317; GFX8-LABEL: idot2: 318; GFX8: ; %bb.0: ; 
%entry 319; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 320; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 321; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 322; GFX8-NEXT: s_waitcnt lgkmcnt(0) 323; GFX8-NEXT: v_mov_b32_e32 v1, s5 324; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 325; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 326; GFX8-NEXT: flat_load_dword v3, v[0:1] 327; GFX8-NEXT: v_mov_b32_e32 v1, s7 328; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 329; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 330; GFX8-NEXT: flat_load_dword v0, v[0:1] 331; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 332; GFX8-NEXT: s_waitcnt vmcnt(1) 333; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 334; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 335; GFX8-NEXT: s_waitcnt vmcnt(0) 336; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 337; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 338; GFX8-NEXT: s_waitcnt lgkmcnt(0) 339; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 340; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 341; GFX8-NEXT: v_mov_b32_e32 v0, s0 342; GFX8-NEXT: v_mov_b32_e32 v1, s1 343; GFX8-NEXT: flat_store_dword v[0:1], v2 344; GFX8-NEXT: s_endpgm 345; 346; GFX9-NODL-LABEL: idot2: 347; GFX9-NODL: ; %bb.0: ; %entry 348; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 349; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 350; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 351; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 352; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 353; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 354; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 355; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 356; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 357; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 358; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 359; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 360; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 361; GFX9-NODL-NEXT: 
global_store_dword v0, v1, s[2:3] 362; GFX9-NODL-NEXT: s_endpgm 363; 364; GFX9-DL-LABEL: idot2: 365; GFX9-DL: ; %bb.0: ; %entry 366; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 367; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 368; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 369; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 370; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 371; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 372; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 373; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 374; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 375; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 376; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 377; GFX9-DL-NEXT: s_endpgm 378; 379; GFX10-DL-LABEL: idot2: 380; GFX10-DL: ; %bb.0: ; %entry 381; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 382; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 383; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 384; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 385; GFX10-DL-NEXT: s_clause 0x1 386; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 387; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 388; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 389; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 390; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 391; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 392; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 393; GFX10-DL-NEXT: s_endpgm 394 <2 x i16> addrspace(1)* %src2, 395 i32 addrspace(1)* nocapture %dst) { 396entry: 397 %idx = call i32 @llvm.amdgcn.workitem.id.x() 398 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 399 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 400 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 401 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 402 403 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 404 %conv = sext i16 %s1.elt1 to i32 405 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 406 %conv2 = sext i16 %s2.elt1 to i32 407 %mul1 = mul nuw i32 %conv2, %conv 
408 409 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 410 %conv3 = sext i16 %s1.elt2 to i32 411 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 412 %conv4 = sext i16 %s2.elt2 to i32 413 %mul2 = mul nuw i32 %conv4, %conv3 414 415 %s3 = load i32, i32 addrspace(1)* %dst, align 4 416 %add = add i32 %mul2, %s3 417 %add6 = add i32 %add, %mul1 418 store i32 %add6, i32 addrspace(1)* %dst, align 4 419 ret void 420} 421 422define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, 423; GFX7-LABEL: idot2_MixedTypedMul: 424; GFX7: ; %bb.0: ; %entry 425; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 426; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 427; GFX7-NEXT: s_mov_b32 s3, 0xf000 428; GFX7-NEXT: s_mov_b32 s10, 0 429; GFX7-NEXT: s_mov_b32 s11, s3 430; GFX7-NEXT: s_waitcnt lgkmcnt(0) 431; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 432; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 433; GFX7-NEXT: v_mov_b32_e32 v1, 0 434; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 435; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 436; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 437; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 438; GFX7-NEXT: s_mov_b32 s2, -1 439; GFX7-NEXT: s_waitcnt vmcnt(1) 440; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 441; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 442; GFX7-NEXT: s_waitcnt vmcnt(0) 443; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 444; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 445; GFX7-NEXT: s_waitcnt lgkmcnt(0) 446; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 447; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v1 448; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 449; GFX7-NEXT: s_endpgm 450; 451; GFX8-LABEL: idot2_MixedTypedMul: 452; GFX8: ; %bb.0: ; %entry 453; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 454; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 455; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 456; GFX8-NEXT: s_waitcnt lgkmcnt(0) 457; GFX8-NEXT: v_mov_b32_e32 v1, s5 458; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 459; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc 460; GFX8-NEXT: flat_load_dword v3, v[0:1] 461; GFX8-NEXT: v_mov_b32_e32 v1, s7 462; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 463; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 464; GFX8-NEXT: flat_load_dword v0, v[0:1] 465; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 466; GFX8-NEXT: s_waitcnt vmcnt(1) 467; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 468; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 469; GFX8-NEXT: s_waitcnt vmcnt(0) 470; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 471; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 472; GFX8-NEXT: s_waitcnt lgkmcnt(0) 473; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 474; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 475; GFX8-NEXT: v_mov_b32_e32 v0, s0 476; GFX8-NEXT: v_mov_b32_e32 v1, s1 477; GFX8-NEXT: flat_store_dword v[0:1], v2 478; GFX8-NEXT: s_endpgm 479; 480; GFX9-NODL-LABEL: idot2_MixedTypedMul: 481; GFX9-NODL: ; %bb.0: ; %entry 482; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 483; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 484; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 485; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 486; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 487; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 488; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 489; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 490; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 491; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 492; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 493; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 494; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 495; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 496; GFX9-NODL-NEXT: s_endpgm 497; 498; GFX9-DL-LABEL: idot2_MixedTypedMul: 499; GFX9-DL: ; %bb.0: ; %entry 500; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 501; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 502; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 503; GFX9-DL-NEXT: 
s_waitcnt lgkmcnt(0) 504; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 505; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 506; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 507; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 508; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 509; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 510; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 511; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 512; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 513; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 514; GFX9-DL-NEXT: s_endpgm 515; 516; GFX10-DL-LABEL: idot2_MixedTypedMul: 517; GFX10-DL: ; %bb.0: ; %entry 518; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 519; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 520; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 521; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 522; GFX10-DL-NEXT: s_clause 0x1 523; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 524; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 525; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 526; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 527; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 528; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 529; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 530; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 531; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 532; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 533; GFX10-DL-NEXT: s_endpgm 534 <2 x i16> addrspace(1)* %src2, 535 i32 addrspace(1)* nocapture %dst) { 536entry: 537 %idx = call i32 @llvm.amdgcn.workitem.id.x() 538 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 539 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 540 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 541 %vec2 = load <2 x 
i16>, <2 x i16> addrspace(1)* %gep2 542 543 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 544 %conv = sext i16 %s1.elt1 to i32 545 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 546 %conv2 = sext i16 %s2.elt1 to i32 547 %mul1 = mul nuw i32 %conv2, %conv 548 549 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 550 %conv3 = zext i16 %s1.elt2 to i32 551 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 552 %conv4 = zext i16 %s2.elt2 to i32 553 %mul2 = mul nuw i32 %conv4, %conv3 554 555 %s3 = load i32, i32 addrspace(1)* %dst, align 4 556 %add = add i32 %mul2, %s3 557 %add6 = add i32 %add, %mul1 558 store i32 %add6, i32 addrspace(1)* %dst, align 4 559 ret void 560} 561 562define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, 563; GFX7-LABEL: udot2_alt_AddOperands: 564; GFX7: ; %bb.0: ; %entry 565; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 566; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 567; GFX7-NEXT: s_mov_b32 s3, 0xf000 568; GFX7-NEXT: s_mov_b32 s10, 0 569; GFX7-NEXT: s_mov_b32 s11, s3 570; GFX7-NEXT: s_waitcnt lgkmcnt(0) 571; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 572; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 573; GFX7-NEXT: v_mov_b32_e32 v1, 0 574; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 575; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 576; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 577; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 578; GFX7-NEXT: s_mov_b32 s2, -1 579; GFX7-NEXT: s_waitcnt vmcnt(1) 580; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 581; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 582; GFX7-NEXT: s_waitcnt vmcnt(0) 583; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 584; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 585; GFX7-NEXT: s_waitcnt lgkmcnt(0) 586; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 587; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 588; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 589; GFX7-NEXT: s_endpgm 590; 591; GFX8-LABEL: udot2_alt_AddOperands: 592; GFX8: ; %bb.0: ; %entry 593; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 594; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 595; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 596; GFX8-NEXT: s_waitcnt lgkmcnt(0) 597; GFX8-NEXT: v_mov_b32_e32 v1, s5 598; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 599; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 600; GFX8-NEXT: flat_load_dword v3, v[0:1] 601; GFX8-NEXT: v_mov_b32_e32 v1, s7 602; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 603; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 604; GFX8-NEXT: flat_load_dword v0, v[0:1] 605; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 606; GFX8-NEXT: s_waitcnt vmcnt(1) 607; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 608; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 609; GFX8-NEXT: s_waitcnt vmcnt(0) 610; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 611; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 612; GFX8-NEXT: s_waitcnt lgkmcnt(0) 613; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 614; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 615; GFX8-NEXT: v_mov_b32_e32 v0, s0 616; GFX8-NEXT: v_mov_b32_e32 v1, s1 617; GFX8-NEXT: flat_store_dword v[0:1], v2 618; GFX8-NEXT: s_endpgm 619; 620; GFX9-NODL-LABEL: udot2_alt_AddOperands: 621; GFX9-NODL: ; %bb.0: ; %entry 622; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 623; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 624; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 625; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 626; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 627; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 628; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 629; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 630; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 631; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 632; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 633; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 634; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 635; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 636; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 637; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 638; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 
639; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 640; GFX9-NODL-NEXT: s_endpgm 641; 642; GFX9-DL-LABEL: udot2_alt_AddOperands: 643; GFX9-DL: ; %bb.0: ; %entry 644; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 645; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 646; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 647; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 648; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 649; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 650; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 651; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 652; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 653; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 654; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 655; GFX9-DL-NEXT: s_endpgm 656; 657; GFX10-DL-LABEL: udot2_alt_AddOperands: 658; GFX10-DL: ; %bb.0: ; %entry 659; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 660; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 661; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 662; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 663; GFX10-DL-NEXT: s_clause 0x1 664; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 665; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 666; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 667; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 668; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 669; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 670; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 671; GFX10-DL-NEXT: s_endpgm 672 <2 x i16> addrspace(1)* %src2, 673 i32 addrspace(1)* nocapture %dst) { 674entry: 675 %idx = call i32 @llvm.amdgcn.workitem.id.x() 676 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 677 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 678 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 679 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 680 681 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 682 %conv = zext i16 %s1.elt1 to i32 683 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 684 %conv2 = zext i16 
%s2.elt1 to i32 685 %mul1 = mul nuw i32 %conv2, %conv 686 687 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 688 %conv3 = zext i16 %s1.elt2 to i32 689 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 690 %conv4 = zext i16 %s2.elt2 to i32 691 %mul2 = mul nuw i32 %conv4, %conv3 692 693 %s3 = load i32, i32 addrspace(1)* %dst, align 4 694 %add = add i32 %s3, %mul2 695 %add6 = add i32 %mul1, %add 696 store i32 %add6, i32 addrspace(1)* %dst, align 4 697 ret void 698} 699 700define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, 701; GFX7-LABEL: idot2_MixedExt: 702; GFX7: ; %bb.0: ; %entry 703; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 704; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 705; GFX7-NEXT: s_mov_b32 s3, 0xf000 706; GFX7-NEXT: s_mov_b32 s10, 0 707; GFX7-NEXT: s_mov_b32 s11, s3 708; GFX7-NEXT: s_waitcnt lgkmcnt(0) 709; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 710; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 711; GFX7-NEXT: v_mov_b32_e32 v1, 0 712; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 713; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 714; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 715; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 716; GFX7-NEXT: s_mov_b32 s2, -1 717; GFX7-NEXT: s_waitcnt vmcnt(1) 718; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 719; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 720; GFX7-NEXT: s_waitcnt vmcnt(0) 721; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 722; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 723; GFX7-NEXT: s_waitcnt lgkmcnt(0) 724; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 725; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 726; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 727; GFX7-NEXT: s_endpgm 728; 729; GFX8-LABEL: idot2_MixedExt: 730; GFX8: ; %bb.0: ; %entry 731; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 732; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 733; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 734; GFX8-NEXT: s_waitcnt lgkmcnt(0) 735; GFX8-NEXT: v_mov_b32_e32 v1, s5 736; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
s4, v2 737; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 738; GFX8-NEXT: flat_load_dword v3, v[0:1] 739; GFX8-NEXT: v_mov_b32_e32 v1, s7 740; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 741; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 742; GFX8-NEXT: flat_load_dword v0, v[0:1] 743; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 744; GFX8-NEXT: s_waitcnt vmcnt(1) 745; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 746; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 747; GFX8-NEXT: s_waitcnt vmcnt(0) 748; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 749; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 750; GFX8-NEXT: s_waitcnt lgkmcnt(0) 751; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 752; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 753; GFX8-NEXT: v_mov_b32_e32 v0, s0 754; GFX8-NEXT: v_mov_b32_e32 v1, s1 755; GFX8-NEXT: flat_store_dword v[0:1], v2 756; GFX8-NEXT: s_endpgm 757; 758; GFX9-NODL-LABEL: idot2_MixedExt: 759; GFX9-NODL: ; %bb.0: ; %entry 760; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 761; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 762; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 763; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 764; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 765; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 766; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 767; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 768; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 769; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 770; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 771; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 772; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 773; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 774; GFX9-NODL-NEXT: s_endpgm 775; 776; GFX9-DL-LABEL: idot2_MixedExt: 777; GFX9-DL: ; %bb.0: ; %entry 778; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 779; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 780; GFX9-DL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 781; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 782; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 783; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 784; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 785; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 786; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 787; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 788; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 789; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 790; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 791; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 792; GFX9-DL-NEXT: s_endpgm 793; 794; GFX10-DL-LABEL: idot2_MixedExt: 795; GFX10-DL: ; %bb.0: ; %entry 796; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 797; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 798; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 799; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 800; GFX10-DL-NEXT: s_clause 0x1 801; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 802; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 803; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 804; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 805; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 806; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 807; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 808; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 809; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 810; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 811; GFX10-DL-NEXT: s_endpgm 812 <2 x i16> addrspace(1)* %src2, 813 i32 addrspace(1)* nocapture %dst) { 814entry: 815 %idx = call i32 @llvm.amdgcn.workitem.id.x() 816 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 817 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 818 %gep2 = getelementptr <2 x i16>, <2 x i16> 
addrspace(1)* %src2, i32 %idx 819 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 820 821 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 822 %conv = sext i16 %s1.elt1 to i32 823 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 824 %conv2 = zext i16 %s2.elt1 to i32 825 %mul1 = mul nuw i32 %conv2, %conv 826 827 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 828 %conv3 = sext i16 %s1.elt2 to i32 829 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 830 %conv4 = sext i16 %s2.elt2 to i32 831 %mul2 = mul nuw i32 %conv4, %conv3 832 833 %s3 = load i32, i32 addrspace(1)* %dst, align 4 834 %add = add i32 %mul2, %s3 835 %add6 = add i32 %add, %mul1 836 store i32 %add6, i32 addrspace(1)* %dst, align 4 837 ret void 838} 839 840define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, 841; GFX7-LABEL: notudot2_SameVec: 842; GFX7: ; %bb.0: ; %entry 843; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 844; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 845; GFX7-NEXT: s_mov_b32 s3, 0xf000 846; GFX7-NEXT: s_mov_b32 s10, 0 847; GFX7-NEXT: s_mov_b32 s11, s3 848; GFX7-NEXT: s_waitcnt lgkmcnt(0) 849; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 850; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 851; GFX7-NEXT: v_mov_b32_e32 v1, 0 852; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 853; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 854; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 855; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 856; GFX7-NEXT: s_mov_b32 s2, -1 857; GFX7-NEXT: s_waitcnt vmcnt(1) 858; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 859; GFX7-NEXT: s_waitcnt vmcnt(0) 860; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 861; GFX7-NEXT: s_waitcnt lgkmcnt(0) 862; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s4 863; GFX7-NEXT: v_mad_u32_u24 v0, v1, v1, v0 864; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 865; GFX7-NEXT: s_endpgm 866; 867; GFX8-LABEL: notudot2_SameVec: 868; GFX8: ; %bb.0: ; %entry 869; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 870; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 871; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 872; GFX8-NEXT: s_waitcnt lgkmcnt(0) 873; GFX8-NEXT: v_mov_b32_e32 v1, s5 874; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 875; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 876; GFX8-NEXT: flat_load_dword v3, v[0:1] 877; GFX8-NEXT: v_mov_b32_e32 v1, s7 878; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 879; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 880; GFX8-NEXT: flat_load_dword v0, v[0:1] 881; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 882; GFX8-NEXT: s_waitcnt vmcnt(1) 883; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 884; GFX8-NEXT: s_waitcnt vmcnt(0) 885; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 886; GFX8-NEXT: s_waitcnt lgkmcnt(0) 887; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2 888; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0 889; GFX8-NEXT: v_mov_b32_e32 v0, s0 890; GFX8-NEXT: v_mov_b32_e32 v1, s1 891; GFX8-NEXT: flat_store_dword v[0:1], v2 892; GFX8-NEXT: s_endpgm 893; 894; GFX9-NODL-LABEL: notudot2_SameVec: 895; GFX9-NODL: ; %bb.0: ; %entry 896; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 897; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 898; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 899; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 900; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 901; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 902; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 903; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 904; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 905; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 906; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 907; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 908; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 909; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 910; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 911; GFX9-NODL-NEXT: s_endpgm 912; 913; GFX9-DL-LABEL: notudot2_SameVec: 914; GFX9-DL: ; %bb.0: ; %entry 915; GFX9-DL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 916; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 917; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 918; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 919; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 920; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 921; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 922; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 923; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 924; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 925; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 926; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 927; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 928; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 929; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 930; GFX9-DL-NEXT: s_endpgm 931; 932; GFX10-DL-LABEL: notudot2_SameVec: 933; GFX10-DL: ; %bb.0: ; %entry 934; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 935; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 936; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 937; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 938; GFX10-DL-NEXT: s_clause 0x1 939; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 940; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 941; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 942; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 943; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 944; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 945; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 946; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 947; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 948; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 949; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 950; GFX10-DL-NEXT: s_endpgm 951 <2 x i16> addrspace(1)* %src2, 952 i32 addrspace(1)* nocapture %dst) { 953entry: 954 %idx = call i32 @llvm.amdgcn.workitem.id.x() 955 %gep1 = getelementptr <2 x 
i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 956 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 957 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 958 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 959 960 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 961 %conv = zext i16 %s1.elt1 to i32 962 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0 963 %conv2 = zext i16 %s2.elt1 to i32 964 %mul1 = mul i32 %conv2, %conv 965 966 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1 967 %conv3 = zext i16 %s1.elt2 to i32 968 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 969 %conv4 = zext i16 %s2.elt2 to i32 970 %mul2 = mul i32 %conv4, %conv3 971 972 %s3 = load i32, i32 addrspace(1)* %dst, align 4 973 %add = add i32 %mul2, %s3 974 %add6 = add i32 %add, %mul1 975 store i32 %add6, i32 addrspace(1)* %dst, align 4 976 ret void 977} 978 979define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, 980; GFX7-LABEL: udot2_v4i16: 981; GFX7: ; %bb.0: ; %entry 982; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 983; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 984; GFX7-NEXT: s_mov_b32 s3, 0xf000 985; GFX7-NEXT: s_mov_b32 s10, 0 986; GFX7-NEXT: s_mov_b32 s11, s3 987; GFX7-NEXT: s_waitcnt lgkmcnt(0) 988; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 989; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 990; GFX7-NEXT: v_mov_b32_e32 v1, 0 991; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 992; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] 993; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 994; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 995; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 996; GFX7-NEXT: s_mov_b32 s2, -1 997; GFX7-NEXT: s_waitcnt vmcnt(1) 998; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 999; GFX7-NEXT: s_waitcnt vmcnt(0) 1000; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 1001; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1002; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1003; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1004; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 1005; 
GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 1006; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1007; GFX7-NEXT: s_endpgm 1008; 1009; GFX8-LABEL: udot2_v4i16: 1010; GFX8: ; %bb.0: ; %entry 1011; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1012; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1013; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1014; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX8-NEXT: v_mov_b32_e32 v1, s5 1016; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1017; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1018; GFX8-NEXT: v_mov_b32_e32 v3, s7 1019; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 1020; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1021; GFX8-NEXT: flat_load_dword v0, v[0:1] 1022; GFX8-NEXT: flat_load_dword v1, v[2:3] 1023; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1024; GFX8-NEXT: s_waitcnt vmcnt(1) 1025; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 1026; GFX8-NEXT: s_waitcnt vmcnt(0) 1027; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 1028; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1029; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1030; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1031; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2 1032; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 1033; GFX8-NEXT: v_mov_b32_e32 v0, s0 1034; GFX8-NEXT: v_mov_b32_e32 v1, s1 1035; GFX8-NEXT: flat_store_dword v[0:1], v2 1036; GFX8-NEXT: s_endpgm 1037; 1038; GFX9-NODL-LABEL: udot2_v4i16: 1039; GFX9-NODL: ; %bb.0: ; %entry 1040; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1041; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1042; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1043; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1044; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1045; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1046; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1047; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1048; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1049; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1050; GFX9-NODL-NEXT: 
v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1051; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1053; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1054; GFX9-NODL-NEXT: s_endpgm 1055; 1056; GFX9-DL-LABEL: udot2_v4i16: 1057; GFX9-DL: ; %bb.0: ; %entry 1058; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1059; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1060; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1061; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1062; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1063; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1064; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1065; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1066; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1067; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1068; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1069; GFX9-DL-NEXT: s_endpgm 1070; 1071; GFX10-DL-LABEL: udot2_v4i16: 1072; GFX10-DL: ; %bb.0: ; %entry 1073; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1074; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1075; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1076; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1077; GFX10-DL-NEXT: s_clause 0x1 1078; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1079; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1080; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1081; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1082; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1083; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 1084; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 1085; GFX10-DL-NEXT: s_endpgm 1086 <4 x i16> addrspace(1)* %src2, 1087 i32 addrspace(1)* nocapture %dst) { 1088entry: 1089 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1090 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1091 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1092 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1093 %vec2 = load 
<4 x i16>, <4 x i16> addrspace(1)* %gep2 1094 1095 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 1096 %conv = zext i16 %s1.elt1 to i32 1097 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 1098 %conv2 = zext i16 %s2.elt1 to i32 1099 %mul1 = mul i32 %conv2, %conv 1100 1101 %s1.elt2 = extractelement <4 x i16> %vec1, i64 1 1102 %conv3 = zext i16 %s1.elt2 to i32 1103 %s2.elt2 = extractelement <4 x i16> %vec2, i64 1 1104 %conv4 = zext i16 %s2.elt2 to i32 1105 %mul2 = mul i32 %conv4, %conv3 1106 1107 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1108 %add = add i32 %mul2, %s3 1109 %add6 = add i32 %add, %mul1 1110 store i32 %add6, i32 addrspace(1)* %dst, align 4 1111 ret void 1112} 1113 1114define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, 1115; GFX7-LABEL: udot2_v4i16_Hi: 1116; GFX7: ; %bb.0: ; %entry 1117; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1118; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1119; GFX7-NEXT: s_mov_b32 s3, 0xf000 1120; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1121; GFX7-NEXT: v_mov_b32_e32 v1, 0 1122; GFX7-NEXT: s_mov_b32 s10, 0 1123; GFX7-NEXT: s_mov_b32 s11, s3 1124; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1126; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 1127; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1128; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 1129; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1130; GFX7-NEXT: s_mov_b32 s2, -1 1131; GFX7-NEXT: s_waitcnt vmcnt(1) 1132; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 1133; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1134; GFX7-NEXT: s_waitcnt vmcnt(0) 1135; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 1136; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1137; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 1139; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 1140; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1141; GFX7-NEXT: s_endpgm 1142; 1143; GFX8-LABEL: udot2_v4i16_Hi: 1144; GFX8: ; 
%bb.0: ; %entry 1145; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1146; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1147; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1148; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX8-NEXT: v_mov_b32_e32 v1, s5 1150; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 1151; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1152; GFX8-NEXT: v_mov_b32_e32 v3, s7 1153; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 1154; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1155; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 1156; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1157; GFX8-NEXT: flat_load_dword v2, v[0:1] 1158; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 1159; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1160; GFX8-NEXT: flat_load_dword v0, v[0:1] 1161; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1162; GFX8-NEXT: s_waitcnt vmcnt(1) 1163; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 1164; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1165; GFX8-NEXT: s_waitcnt vmcnt(0) 1166; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0 1167; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1168; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1169; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2 1170; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 1171; GFX8-NEXT: v_mov_b32_e32 v0, s0 1172; GFX8-NEXT: v_mov_b32_e32 v1, s1 1173; GFX8-NEXT: flat_store_dword v[0:1], v2 1174; GFX8-NEXT: s_endpgm 1175; 1176; GFX9-NODL-LABEL: udot2_v4i16_Hi: 1177; GFX9-NODL: ; %bb.0: ; %entry 1178; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1179; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1180; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1181; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1182; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 1183; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 1184; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1185; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1186; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1187; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:WORD_0 1188; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1189; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1190; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1191; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1192; GFX9-NODL-NEXT: s_endpgm 1193; 1194; GFX9-DL-LABEL: udot2_v4i16_Hi: 1195; GFX9-DL: ; %bb.0: ; %entry 1196; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1197; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1198; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1199; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 1201; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 1202; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1203; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1204; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1205; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1206; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1207; GFX9-DL-NEXT: s_endpgm 1208; 1209; GFX10-DL-LABEL: udot2_v4i16_Hi: 1210; GFX10-DL: ; %bb.0: ; %entry 1211; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1212; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1213; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1214; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX10-DL-NEXT: s_clause 0x1 1216; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 1217; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 1218; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1219; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1220; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1221; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 1222; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 1223; GFX10-DL-NEXT: s_endpgm 1224 <4 x i16> addrspace(1)* %src2, 1225 i32 addrspace(1)* nocapture %dst) { 1226entry: 1227 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1228 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1229 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1230 %gep2 = 
getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1231 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1232 1233 %s1.elt1 = extractelement <4 x i16> %vec1, i64 2 1234 %conv = zext i16 %s1.elt1 to i32 1235 %s2.elt1 = extractelement <4 x i16> %vec2, i64 2 1236 %conv2 = zext i16 %s2.elt1 to i32 1237 %mul1 = mul i32 %conv2, %conv 1238 1239 %s1.elt2 = extractelement <4 x i16> %vec1, i64 3 1240 %conv3 = zext i16 %s1.elt2 to i32 1241 %s2.elt2 = extractelement <4 x i16> %vec2, i64 3 1242 %conv4 = zext i16 %s2.elt2 to i32 1243 %mul2 = mul i32 %conv4, %conv3 1244 1245 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1246 %add = add i32 %mul2, %s3 1247 %add6 = add i32 %add, %mul1 1248 store i32 %add6, i32 addrspace(1)* %dst, align 4 1249 ret void 1250} 1251 1252define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, 1253; GFX7-LABEL: notudot2_v4i16_Even: 1254; GFX7: ; %bb.0: ; %entry 1255; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1256; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1257; GFX7-NEXT: s_mov_b32 s3, 0xf000 1258; GFX7-NEXT: s_mov_b32 s10, 0 1259; GFX7-NEXT: s_mov_b32 s11, s3 1260; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1262; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1263; GFX7-NEXT: v_mov_b32_e32 v1, 0 1264; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 1265; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] 1266; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 1267; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 1268; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1269; GFX7-NEXT: s_mov_b32 s2, -1 1270; GFX7-NEXT: s_waitcnt vmcnt(1) 1271; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 1272; GFX7-NEXT: s_waitcnt vmcnt(0) 1273; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 1274; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 1275; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 1276; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1277; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 1278; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1279; 
GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1280; GFX7-NEXT: s_endpgm 1281; 1282; GFX8-LABEL: notudot2_v4i16_Even: 1283; GFX8: ; %bb.0: ; %entry 1284; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1285; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1286; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1287; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX8-NEXT: v_mov_b32_e32 v1, s5 1289; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1290; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1291; GFX8-NEXT: v_mov_b32_e32 v3, s7 1292; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 1293; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1294; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1295; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1296; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1297; GFX8-NEXT: s_waitcnt vmcnt(1) 1298; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 1299; GFX8-NEXT: s_waitcnt vmcnt(0) 1300; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 1301; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 1302; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 1303; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 1305; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 1306; GFX8-NEXT: v_mov_b32_e32 v0, s0 1307; GFX8-NEXT: v_mov_b32_e32 v1, s1 1308; GFX8-NEXT: flat_store_dword v[0:1], v2 1309; GFX8-NEXT: s_endpgm 1310; 1311; GFX9-NODL-LABEL: notudot2_v4i16_Even: 1312; GFX9-NODL: ; %bb.0: ; %entry 1313; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1314; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1315; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1316; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1317; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1318; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1319; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1320; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 1321; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1322; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1323; GFX9-NODL-NEXT: 
v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1324; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1325; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 1326; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] 1327; GFX9-NODL-NEXT: s_endpgm 1328; 1329; GFX9-DL-LABEL: notudot2_v4i16_Even: 1330; GFX9-DL: ; %bb.0: ; %entry 1331; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1332; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1333; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1334; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1335; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1336; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1337; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1338; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 1339; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1340; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1341; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1342; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 1344; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] 1345; GFX9-DL-NEXT: s_endpgm 1346; 1347; GFX10-DL-LABEL: notudot2_v4i16_Even: 1348; GFX10-DL: ; %bb.0: ; %entry 1349; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1350; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1351; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1352; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX10-DL-NEXT: s_clause 0x1 1354; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1355; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1356; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1357; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1358; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1359; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1360; 
GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1361; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1362; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 1363; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1364; GFX10-DL-NEXT: s_endpgm 1365 <4 x i16> addrspace(1)* %src2, 1366 i32 addrspace(1)* nocapture %dst) { 1367entry: 1368 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1369 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1370 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1371 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1372 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1373 1374 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 1375 %conv = zext i16 %s1.elt1 to i32 1376 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 1377 %conv2 = zext i16 %s2.elt1 to i32 1378 %mul1 = mul i32 %conv2, %conv 1379 1380 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 1381 %conv3 = zext i16 %s1.elt2 to i32 1382 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 1383 %conv4 = zext i16 %s2.elt2 to i32 1384 %mul2 = mul i32 %conv4, %conv3 1385 1386 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1387 %add = add i32 %mul2, %s3 1388 %add6 = add i32 %add, %mul1 1389 store i32 %add6, i32 addrspace(1)* %dst, align 4 1390 ret void 1391} 1392 1393define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, 1394; GFX7-LABEL: notudot2_v4i16_Middle: 1395; GFX7: ; %bb.0: ; %entry 1396; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1397; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1398; GFX7-NEXT: s_mov_b32 s3, 0xf000 1399; GFX7-NEXT: s_mov_b32 s10, 0 1400; GFX7-NEXT: s_mov_b32 s11, s3 1401; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1403; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1404; GFX7-NEXT: v_mov_b32_e32 v1, 0 1405; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 1406; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] 1407; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 1408; GFX7-NEXT: buffer_load_dwordx2 
v[0:1], v[0:1], s[4:7], 0 addr64 1409; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1410; GFX7-NEXT: s_mov_b32 s2, -1 1411; GFX7-NEXT: s_waitcnt vmcnt(1) 1412; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 1413; GFX7-NEXT: s_waitcnt vmcnt(0) 1414; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 1415; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1416; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1417; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1418; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 1419; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1420; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1421; GFX7-NEXT: s_endpgm 1422; 1423; GFX8-LABEL: notudot2_v4i16_Middle: 1424; GFX8: ; %bb.0: ; %entry 1425; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1426; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1427; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1428; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1429; GFX8-NEXT: v_mov_b32_e32 v1, s5 1430; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1431; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1432; GFX8-NEXT: v_mov_b32_e32 v3, s7 1433; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 1434; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1435; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1436; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1437; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1438; GFX8-NEXT: s_waitcnt vmcnt(1) 1439; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 1440; GFX8-NEXT: s_waitcnt vmcnt(0) 1441; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 1442; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1443; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1444; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 1446; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 1447; GFX8-NEXT: v_mov_b32_e32 v0, s0 1448; GFX8-NEXT: v_mov_b32_e32 v1, s1 1449; GFX8-NEXT: flat_store_dword v[0:1], v2 1450; GFX8-NEXT: s_endpgm 1451; 1452; GFX9-NODL-LABEL: notudot2_v4i16_Middle: 1453; GFX9-NODL: ; %bb.0: ; %entry 1454; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1455; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x34 1456; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1457; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1458; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1459; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1460; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1461; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 1462; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1463; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1464; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1465; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1466; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 1467; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] 1468; GFX9-NODL-NEXT: s_endpgm 1469; 1470; GFX9-DL-LABEL: notudot2_v4i16_Middle: 1471; GFX9-DL: ; %bb.0: ; %entry 1472; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1473; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1474; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1475; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1476; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1477; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1478; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1479; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 1480; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1481; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1482; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1483; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1484; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 1485; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] 1486; GFX9-DL-NEXT: s_endpgm 1487; 1488; GFX10-DL-LABEL: notudot2_v4i16_Middle: 1489; GFX10-DL: ; %bb.0: ; %entry 1490; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1491; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1492; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1493; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 
1494; GFX10-DL-NEXT: s_clause 0x1 1495; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] 1496; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] 1497; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1498; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1499; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1500; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1501; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1502; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 1504; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1505; GFX10-DL-NEXT: s_endpgm 1506 <4 x i16> addrspace(1)* %src2, 1507 i32 addrspace(1)* nocapture %dst) { 1508entry: 1509 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1510 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx 1511 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1 1512 %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx 1513 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2 1514 1515 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1 1516 %conv = zext i16 %s1.elt1 to i32 1517 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1 1518 %conv2 = zext i16 %s2.elt1 to i32 1519 %mul1 = mul i32 %conv2, %conv 1520 1521 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 1522 %conv3 = zext i16 %s1.elt2 to i32 1523 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 1524 %conv4 = zext i16 %s2.elt2 to i32 1525 %mul2 = mul i32 %conv4, %conv3 1526 1527 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1528 %add = add i32 %mul2, %s3 1529 %add6 = add i32 %add, %mul1 1530 store i32 %add6, i32 addrspace(1)* %dst, align 4 1531 ret void 1532} 1533 1534define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, 1535; GFX7-LABEL: notudot2_DiffIndex: 1536; GFX7: ; %bb.0: ; %entry 1537; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1538; GFX7-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0xd 1539; GFX7-NEXT: s_mov_b32 s3, 0xf000 1540; GFX7-NEXT: s_mov_b32 s10, 0 1541; GFX7-NEXT: s_mov_b32 s11, s3 1542; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1543; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1544; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1545; GFX7-NEXT: v_mov_b32_e32 v1, 0 1546; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1547; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1548; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1549; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1550; GFX7-NEXT: s_mov_b32 s2, -1 1551; GFX7-NEXT: s_waitcnt vmcnt(1) 1552; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1553; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 1554; GFX7-NEXT: s_waitcnt vmcnt(0) 1555; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1556; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 1557; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1558; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s4 1559; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0 1560; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1561; GFX7-NEXT: s_endpgm 1562; 1563; GFX8-LABEL: notudot2_DiffIndex: 1564; GFX8: ; %bb.0: ; %entry 1565; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1566; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1567; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1568; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1569; GFX8-NEXT: v_mov_b32_e32 v1, s5 1570; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1571; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1572; GFX8-NEXT: flat_load_dword v3, v[0:1] 1573; GFX8-NEXT: v_mov_b32_e32 v1, s7 1574; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1575; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1576; GFX8-NEXT: flat_load_dword v0, v[0:1] 1577; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1578; GFX8-NEXT: s_waitcnt vmcnt(1) 1579; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 1580; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1581; GFX8-NEXT: s_waitcnt vmcnt(0) 1582; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1583; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 1584; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX8-NEXT: 
v_mad_u32_u24 v0, v0, v3, s2 1586; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 1587; GFX8-NEXT: v_mov_b32_e32 v0, s0 1588; GFX8-NEXT: v_mov_b32_e32 v1, s1 1589; GFX8-NEXT: flat_store_dword v[0:1], v2 1590; GFX8-NEXT: s_endpgm 1591; 1592; GFX9-NODL-LABEL: notudot2_DiffIndex: 1593; GFX9-NODL: ; %bb.0: ; %entry 1594; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1595; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1596; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1597; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1599; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1600; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1601; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1602; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1603; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1604; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1605; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1606; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1607; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1608; GFX9-NODL-NEXT: s_endpgm 1609; 1610; GFX9-DL-LABEL: notudot2_DiffIndex: 1611; GFX9-DL: ; %bb.0: ; %entry 1612; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1613; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1614; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1615; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1616; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1617; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1618; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1619; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1620; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1621; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1622; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1623; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1624; GFX9-DL-NEXT: v_add3_u32 
v1, v1, s0, v3 1625; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1626; GFX9-DL-NEXT: s_endpgm 1627; 1628; GFX10-DL-LABEL: notudot2_DiffIndex: 1629; GFX10-DL: ; %bb.0: ; %entry 1630; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1631; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1632; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1633; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1634; GFX10-DL-NEXT: s_clause 0x1 1635; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1636; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1637; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1638; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1639; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1640; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1641; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1642; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1643; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 1644; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1645; GFX10-DL-NEXT: s_endpgm 1646 <2 x i16> addrspace(1)* %src2, 1647 i32 addrspace(1)* nocapture %dst) { 1648entry: 1649 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1650 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 1651 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 1652 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 1653 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 1654 1655 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 1656 %conv = zext i16 %s1.elt1 to i32 1657 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1 1658 %conv2 = zext i16 %s2.elt1 to i32 1659 %mul1 = mul i32 %conv2, %conv 1660 1661 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 1662 %conv3 = zext i16 %s1.elt2 to i32 1663 %s2.elt2 = extractelement <2 x i16> %vec2, i64 0 1664 %conv4 = zext i16 %s2.elt2 to i32 1665 %mul2 = mul i32 %conv4, %conv3 1666 1667 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1668 
%add = add i32 %mul2, %s3 1669 %add6 = add i32 %add, %mul1 1670 store i32 %add6, i32 addrspace(1)* %dst, align 4 1671 ret void 1672} 1673 
; udot2_MultipleUses_add1: zero-extended, lane-matched products (%vec1[i] * %vec2[i]) where the intermediate sum %add1 (= %mul2 + %s3) is used twice, by %add2 and by the final %res; the checks for every listed target contain v_mad_u32_u24 followed by an add and no v_dot2 instruction.
1674define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1, 1675; GFX7-LABEL: udot2_MultipleUses_add1: 1676; GFX7: ; %bb.0: ; %entry 1677; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1678; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1679; GFX7-NEXT: s_mov_b32 s3, 0xf000 1680; GFX7-NEXT: s_mov_b32 s10, 0 1681; GFX7-NEXT: s_mov_b32 s11, s3 1682; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1683; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1684; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1685; GFX7-NEXT: v_mov_b32_e32 v1, 0 1686; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1687; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1688; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1689; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1690; GFX7-NEXT: s_mov_b32 s2, -1 1691; GFX7-NEXT: s_waitcnt vmcnt(1) 1692; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1693; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 1694; GFX7-NEXT: s_waitcnt vmcnt(0) 1695; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1696; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 1697; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1698; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 1699; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1700; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1701; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1702; GFX7-NEXT: s_endpgm 1703; 1704; GFX8-LABEL: udot2_MultipleUses_add1: 1705; GFX8: ; %bb.0: ; %entry 1706; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1707; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1708; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1709; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX8-NEXT: v_mov_b32_e32 v1, s5 1711; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1712; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1713; GFX8-NEXT: flat_load_dword v3, v[0:1] 1714; GFX8-NEXT: v_mov_b32_e32 v1, s7 1715; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1716; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1717; GFX8-NEXT: flat_load_dword v0, v[0:1] 1718; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1719; GFX8-NEXT: s_waitcnt vmcnt(1) 1720; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 1721; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1722; GFX8-NEXT: s_waitcnt vmcnt(0) 1723; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 1724; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1725; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1726; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 1727; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 1728; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1729; GFX8-NEXT: v_mov_b32_e32 v0, s0 1730; GFX8-NEXT: v_mov_b32_e32 v1, s1 1731; GFX8-NEXT: flat_store_dword v[0:1], v2 1732; GFX8-NEXT: s_endpgm 1733; 1734; GFX9-NODL-LABEL: udot2_MultipleUses_add1: 1735; GFX9-NODL: ; %bb.0: ; %entry 1736; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1737; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1738; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1739; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1741; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1742; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1743; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1744; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1745; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1746; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1747; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1748; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1749; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 1750; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 1751; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1752; GFX9-NODL-NEXT: s_endpgm 1753; 1754; GFX9-DL-LABEL: udot2_MultipleUses_add1: 1755; GFX9-DL: ; %bb.0: ; %entry 1756; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1757; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1758; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1759; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1760; 
GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1761; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1762; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1763; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1764; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1765; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1766; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1767; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1768; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1769; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 1770; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 1771; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1772; GFX9-DL-NEXT: s_endpgm 1773; 1774; GFX10-DL-LABEL: udot2_MultipleUses_add1: 1775; GFX10-DL: ; %bb.0: ; %entry 1776; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1777; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1778; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1779; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1780; GFX10-DL-NEXT: s_clause 0x1 1781; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1782; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1783; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1784; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1785; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1786; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1787; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1788; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1789; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1790; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1791; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 1792; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 1793; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1794; GFX10-DL-NEXT: s_endpgm 1795 <2 x i16> addrspace(1)* %src2, 1796 i32 addrspace(1)* nocapture %dst) { 1797entry: 1798 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1799 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 1800 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 1801 %gep2 = 
getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 1802 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 1803 1804 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 1805 %conv = zext i16 %s1.elt1 to i32 1806 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 1807 %conv2 = zext i16 %s2.elt1 to i32 1808 %mul1 = mul i32 %conv2, %conv 1809 1810 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 1811 %conv3 = zext i16 %s1.elt2 to i32 1812 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 1813 %conv4 = zext i16 %s2.elt2 to i32 1814 %mul2 = mul i32 %conv4, %conv3 1815 1816 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1817 %add1 = add i32 %mul2, %s3 1818 %add2 = add i32 %add1, %mul1 1819 1820 %res = add i32 %add2, %add1 1821 store i32 %res, i32 addrspace(1)* %dst, align 4 1822 ret void 1823} 1824 
; idot2_MultipleUses_add1: sign-extended, lane-matched products (%vec1[i] * %vec2[i]) where the intermediate sum %add1 (= %mul2 + %s3) is used twice, by %add2 and by the final %res; the checks for every listed target contain v_mad_i32_i24 followed by an add and no v_dot2 instruction.
1825define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1, 1826; GFX7-LABEL: idot2_MultipleUses_add1: 1827; GFX7: ; %bb.0: ; %entry 1828; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1829; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1830; GFX7-NEXT: s_mov_b32 s3, 0xf000 1831; GFX7-NEXT: s_mov_b32 s10, 0 1832; GFX7-NEXT: s_mov_b32 s11, s3 1833; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1834; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1835; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1836; GFX7-NEXT: v_mov_b32_e32 v1, 0 1837; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1838; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1839; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1840; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1841; GFX7-NEXT: s_mov_b32 s2, -1 1842; GFX7-NEXT: s_waitcnt vmcnt(1) 1843; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 1844; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1845; GFX7-NEXT: s_waitcnt vmcnt(0) 1846; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 1847; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 1848; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1849; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 1850; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 1851; GFX7-NEXT: 
v_add_i32_e32 v0, vcc, v1, v0 1852; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1853; GFX7-NEXT: s_endpgm 1854; 1855; GFX8-LABEL: idot2_MultipleUses_add1: 1856; GFX8: ; %bb.0: ; %entry 1857; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1858; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1859; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1860; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX8-NEXT: v_mov_b32_e32 v1, s5 1862; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1863; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1864; GFX8-NEXT: flat_load_dword v3, v[0:1] 1865; GFX8-NEXT: v_mov_b32_e32 v1, s7 1866; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 1867; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1868; GFX8-NEXT: flat_load_dword v0, v[0:1] 1869; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 1870; GFX8-NEXT: s_waitcnt vmcnt(1) 1871; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 1872; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 1873; GFX8-NEXT: s_waitcnt vmcnt(0) 1874; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 1875; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 1876; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1877; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 1878; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0 1879; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1880; GFX8-NEXT: v_mov_b32_e32 v0, s0 1881; GFX8-NEXT: v_mov_b32_e32 v1, s1 1882; GFX8-NEXT: flat_store_dword v[0:1], v2 1883; GFX8-NEXT: s_endpgm 1884; 1885; GFX9-NODL-LABEL: idot2_MultipleUses_add1: 1886; GFX9-NODL: ; %bb.0: ; %entry 1887; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1888; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1889; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1890; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1891; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 1892; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 1893; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 1894; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1895; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1896; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:WORD_0 1897; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 1898; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1899; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1900; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 1901; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 1902; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 1903; GFX9-NODL-NEXT: s_endpgm 1904; 1905; GFX9-DL-LABEL: idot2_MultipleUses_add1: 1906; GFX9-DL: ; %bb.0: ; %entry 1907; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1908; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1909; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1910; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 1912; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 1913; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 1914; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1915; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1916; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1917; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 1918; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1919; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1920; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 1921; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 1922; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 1923; GFX9-DL-NEXT: s_endpgm 1924; 1925; GFX10-DL-LABEL: idot2_MultipleUses_add1: 1926; GFX10-DL: ; %bb.0: ; %entry 1927; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1928; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1929; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1930; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1931; GFX10-DL-NEXT: s_clause 0x1 1932; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 1933; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 1934; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 1935; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1936; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1 1937; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1938; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2 1939; 
GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1940; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1941; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1942; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 1943; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 1944; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 1945; GFX10-DL-NEXT: s_endpgm 1946 <2 x i16> addrspace(1)* %src2, 1947 i32 addrspace(1)* nocapture %dst) { 1948entry: 1949 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1950 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 1951 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 1952 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 1953 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 1954 1955 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 1956 %conv = sext i16 %s1.elt1 to i32 1957 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 1958 %conv2 = sext i16 %s2.elt1 to i32 1959 %mul1 = mul i32 %conv2, %conv 1960 1961 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 1962 %conv3 = sext i16 %s1.elt2 to i32 1963 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 1964 %conv4 = sext i16 %s2.elt2 to i32 1965 %mul2 = mul i32 %conv4, %conv3 1966 1967 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1968 %add1 = add i32 %mul2, %s3 1969 %add2 = add i32 %add1, %mul1 1970 1971 %res = add i32 %add2, %add1 1972 store i32 %res, i32 addrspace(1)* %dst, align 4 1973 ret void 1974} 1975 ; udot2_MultipleUses_mul1: zero-extended, lane-matched products where %mul1 (the element-0 product) is used twice, by %add0 and by the final %add2; the checks for every listed target contain v_mad_u32_u24 and no v_dot2 instruction. 1976define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1, 1977; GFX7-LABEL: udot2_MultipleUses_mul1: 1978; GFX7: ; %bb.0: ; %entry 1979; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1980; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1981; GFX7-NEXT: s_mov_b32 s3, 0xf000 1982; GFX7-NEXT: s_mov_b32 s10, 0 1983; GFX7-NEXT: s_mov_b32 s11, s3 1984; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 1986; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1987; 
GFX7-NEXT: v_mov_b32_e32 v1, 0 1988; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1989; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 1990; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1991; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1992; GFX7-NEXT: s_mov_b32 s2, -1 1993; GFX7-NEXT: s_waitcnt vmcnt(1) 1994; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1995; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 1996; GFX7-NEXT: s_waitcnt vmcnt(0) 1997; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1998; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 1999; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2000; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 2001; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 2002; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 2003; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 2004; GFX7-NEXT: s_endpgm 2005; 2006; GFX8-LABEL: udot2_MultipleUses_mul1: 2007; GFX8: ; %bb.0: ; %entry 2008; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2009; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2010; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2011; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2012; GFX8-NEXT: v_mov_b32_e32 v1, s5 2013; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2014; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2015; GFX8-NEXT: flat_load_dword v3, v[0:1] 2016; GFX8-NEXT: v_mov_b32_e32 v1, s7 2017; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 2018; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2019; GFX8-NEXT: flat_load_dword v0, v[0:1] 2020; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 2021; GFX8-NEXT: s_waitcnt vmcnt(1) 2022; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 2023; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2024; GFX8-NEXT: s_waitcnt vmcnt(0) 2025; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 2026; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2027; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2 2029; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 2030; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 2031; GFX8-NEXT: v_mov_b32_e32 v0, s0 2032; GFX8-NEXT: v_mov_b32_e32 v1, s1 2033; GFX8-NEXT: 
flat_store_dword v[0:1], v2 2034; GFX8-NEXT: s_endpgm 2035; 2036; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: 2037; GFX9-NODL: ; %bb.0: ; %entry 2038; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2039; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2040; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2041; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2042; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 2043; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 2044; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 2045; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2046; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2047; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 2048; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2049; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 2050; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2051; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 2052; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 2054; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 2055; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 2056; GFX9-NODL-NEXT: s_endpgm 2057; 2058; GFX9-DL-LABEL: udot2_MultipleUses_mul1: 2059; GFX9-DL: ; %bb.0: ; %entry 2060; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2061; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2062; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2063; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2064; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 2065; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 2066; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 2067; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2068; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2069; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 2070; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2071; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2 2072; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2073; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 2074; GFX9-DL-NEXT: s_waitcnt 
lgkmcnt(0) 2075; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 2076; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 2077; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 2078; GFX9-DL-NEXT: s_endpgm 2079; 2080; GFX10-DL-LABEL: udot2_MultipleUses_mul1: 2081; GFX10-DL: ; %bb.0: ; %entry 2082; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2083; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2084; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2085; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2086; GFX10-DL-NEXT: s_clause 0x1 2087; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 2088; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 2089; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2090; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2091; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1 2092; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2093; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2 2094; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2095; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 2096; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2097; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 2098; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2099; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2 2100; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] 2101; GFX10-DL-NEXT: s_endpgm 2102 <2 x i16> addrspace(1)* %src2, 2103 i32 addrspace(1)* nocapture %dst) { 2104entry: 2105 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2106 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx 2107 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1 2108 %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx 2109 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2 2110 2111 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 2112 %conv = zext i16 %s1.elt1 to i32 2113 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 2114 %conv2 = zext i16 %s2.elt1 to i32 2115 %mul1 = mul i32 %conv2, %conv 2116 2117 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 2118 %conv3 = 
zext i16 %s1.elt2 to i32 2119 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 2120 %conv4 = zext i16 %s2.elt2 to i32 2121 %mul2 = mul i32 %conv4, %conv3 2122 2123 %s3 = load i32, i32 addrspace(1)* %dst, align 4 2124 %add0 = add i32 %mul1, %s3 2125 2126 %add1 = add i32 %mul2, %add0 2127 %add2 = add i32 %add1, %mul1 2128 2129 store i32 %add2, i32 addrspace(1)* %dst, align 4 2130 ret void 2131} 2132 
; idot2_MultipleUses_mul1: sign-extended, lane-matched products where %mul1 (the element-0 product) is used twice, by %add0 and by the final %add2; the checks for every listed target contain v_mad_i32_i24 and no v_dot2 instruction.
2133define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1, 2134; GFX7-LABEL: idot2_MultipleUses_mul1: 2135; GFX7: ; %bb.0: ; %entry 2136; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2137; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2138; GFX7-NEXT: s_mov_b32 s3, 0xf000 2139; GFX7-NEXT: s_mov_b32 s10, 0 2140; GFX7-NEXT: s_mov_b32 s11, s3 2141; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2142; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] 2143; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2144; GFX7-NEXT: v_mov_b32_e32 v1, 0 2145; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2146; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] 2147; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2148; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 2149; GFX7-NEXT: s_mov_b32 s2, -1 2150; GFX7-NEXT: s_waitcnt vmcnt(1) 2151; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 2152; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 2153; GFX7-NEXT: s_waitcnt vmcnt(0) 2154; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 2155; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2156; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2157; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4 2158; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 2159; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 2160; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 2161; GFX7-NEXT: s_endpgm 2162; 2163; GFX8-LABEL: idot2_MultipleUses_mul1: 2164; GFX8: ; %bb.0: ; %entry 2165; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2166; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2167; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2168; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2169; GFX8-NEXT: v_mov_b32_e32 v1, s5 2170; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2171; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2172; GFX8-NEXT: flat_load_dword v3, v[0:1] 2173; GFX8-NEXT: v_mov_b32_e32 v1, s7 2174; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 2175; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2176; GFX8-NEXT: flat_load_dword v0, v[0:1] 2177; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 2178; GFX8-NEXT: s_waitcnt vmcnt(1) 2179; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 2180; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 2181; GFX8-NEXT: s_waitcnt vmcnt(0) 2182; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 2183; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2184; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2185; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s2 2186; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 2187; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 2188; GFX8-NEXT: v_mov_b32_e32 v0, s0 2189; GFX8-NEXT: v_mov_b32_e32 v1, s1 2190; GFX8-NEXT: flat_store_dword v[0:1], v2 2191; GFX8-NEXT: s_endpgm 2192; 2193; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: 2194; GFX9-NODL: ; %bb.0: ; %entry 2195; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2196; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2197; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2198; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2199; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] 2200; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] 2201; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 2202; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2203; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2204; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 2205; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2206; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16 2207; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2208; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 2209; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2210; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 2211; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 2212; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] 2213; 
GFX9-NODL-NEXT: s_endpgm 2214; 2215; GFX9-DL-LABEL: idot2_MultipleUses_mul1: 2216; GFX9-DL: ; %bb.0: ; %entry 2217; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2218; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2219; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2220; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2221; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 2222; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] 2223; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 2224; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2225; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2226; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 2227; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2228; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16 2229; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2230; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 2231; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2232; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 2233; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 2234; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] 2235; GFX9-DL-NEXT: s_endpgm 2236; 2237; GFX10-DL-LABEL: idot2_MultipleUses_mul1: 2238; GFX10-DL: ; %bb.0: ; %entry 2239; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2240; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2241; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2242; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX10-DL-NEXT: s_clause 0x1 2244; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] 2245; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] 2246; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2247; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2248; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16 2249; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2250; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 16 2251; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2252; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0 2253; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2254; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 
; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                                   <2 x i16> addrspace(1)* %src2,
                                                   i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul1, %s3

  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Unsigned (zext) <2 x i16> multiply-accumulate in which %mul2, the product of
; the element-1 lanes, is used twice: once in %add0 together with the i32
; accumulator loaded from %dst, and again in %add1. %mul1 (element-0 product)
; is used once. The expected output for every check prefix below is built from
; v_mul_u32_u24 / v_mad_u32_u24 (plus v_add3_u32 on GFX9/GFX10); no v_dot2
; instruction appears in any of them.
define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MultipleUses_mul2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s4
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s2
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                                   <2 x i16> addrspace(1)* %src2,
                                                   i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2 (the first is in %add0).
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Signed (sext) counterpart of @udot2_MultipleUses_mul2: %mul2, the product of
; the sign-extended element-1 lanes, is used in both %add0 and %add1. The
; expected output for every check prefix below is built from v_mul_i32_i24 /
; v_mad_i32_i24 (plus v_add3_u32 on GFX9/GFX10); no v_dot2 instruction appears
; in any of them.
define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MultipleUses_mul2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v4, v0, v3, s2
; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                                   <2 x i16> addrspace(1)* %src2,
                                                   i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2 (the first is in %add0).
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; <2 x i16> multiply-accumulate done entirely in i16: the lanes are multiplied
; without any extension and summed with an i16 accumulator that is loaded from
; and stored back to %dst. The expected output uses v_mad_u32_u24 on GFX7,
; v_mad_u16 on GFX8 and GFX10, and v_mad_legacy_u16 on both GFX9 prefixes; no
; v_dot2 instruction appears for any prefix.
define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_acc16:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[2:3]
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
; GFX9-NODL-NEXT:    global_store_short v1, v0, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
; GFX10-DL-NEXT:    s_endpgm
                                       <2 x i16> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
  %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
  %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2

  %v1e1 = extractelement <2 x i16> %v1, i64 0
  %v2e1 = extractelement <2 x i16> %v2, i64 0
  %mul1 = mul i16 %v1e1, %v2e1

  %v1e2 = extractelement <2 x i16> %v1, i64 1
  %v2e2 = extractelement <2 x i16> %v2, i64 1
  %mul2 = mul i16 %v1e2, %v2e2

  %s2 = load i16, i16 addrspace(1)* %dst, align 2
  %add1 = add i16 %mul2, %s2
  %add2 = add i16 %add1, %mul1
  store i16 %add2, i16 addrspace(1)* %dst, align 2
  ret void
}

; <2 x i8> inputs: each lane is sign-extended from i8 to i32, multiplied with
; `mul nuw`, and accumulated with the i32 loaded from %dst. The expected output
; uses v_mad_i32_i24 on GFX7/GFX8 and v_mul_i32_i24_sdwa + v_add3_u32 on the
; GFX9 and GFX10 prefixes; no v_dot2 instruction appears for any prefix.
define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX7-LABEL: notsdot2_sext8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: notsdot2_sext8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: notsdot2_sext8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[6:7]
; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: notsdot2_sext8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                          <2 x i8> addrspace(1)* %src2,
                                          i32 addrspace(1)* nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src1, i32 %idx
  %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %gep1
  %gep2 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src2, i32 %idx
  %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %gep2

  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
  %conv = sext i8 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
  %conv2 = sext i8 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
  %conv3 = sext i8 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
  %conv4 = sext i8 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Work-item id intrinsic; each kernel in this section calls it to obtain the
; %idx used to index %src1 and %src2.
declare i32 @llvm.amdgcn.workitem.id.x()