; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -verify-machineinstrs < %s | FileCheck %s

; Lowering of @llvm.umul.with.overflow.* for scalable (SVE) vectors.  Each
; test multiplies two vectors, extracts the overflow flag, and selects zero
; on overflow.  Expected codegen: the product comes from mul, and overflow
; is detected via umulh; for sub-element-width types (i8/i16/i32 promoted
; into wider lanes) the high bits of the widened product, exposed with lsr,
; also feed the overflow predicate.  Vectors wider than one SVE register
; are split and each half is handled independently.

declare { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>)

define <vscale x 2 x i8> @umulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y) {
; CHECK-LABEL: umulo_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    and z1.d, z1.d, #0xff
; CHECK-NEXT:    and z0.d, z0.d, #0xff
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    lsr z1.d, z2.d, #8
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
  %b = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> %b
  ret <vscale x 2 x i8> %d
}

declare { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>)

define <vscale x 4 x i8> @umulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y) {
; CHECK-LABEL: umulo_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    and z1.s, z1.s, #0xff
; CHECK-NEXT:    and z0.s, z0.s, #0xff
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    lsr z1.s, z2.s, #8
; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
  %b = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> %b
  ret <vscale x 4 x i8> %d
}

declare { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>)

define <vscale x 8 x i8> @umulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
; CHECK-LABEL: umulo_nxv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    and z1.h, z1.h, #0xff
; CHECK-NEXT:    and z0.h, z0.h, #0xff
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    lsr z1.h, z2.h, #8
; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, #0
; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    mov z2.h, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
  %b = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> %b
  ret <vscale x 8 x i8> %d
}

declare { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)

define <vscale x 16 x i8> @umulo_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; CHECK-LABEL: umulo_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    umulh z2.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    cmpne p0.b, p0/z, z2.b, #0
; CHECK-NEXT:    mov z0.b, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
  %b = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> %b
  ret <vscale x 16 x i8> %d
}

declare { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>)

define <vscale x 32 x i8> @umulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y) {
; CHECK-LABEL: umulo_nxv32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z3.b
; CHECK-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
; CHECK-NEXT:    movprfx z3, z0
; CHECK-NEXT:    umulh z3.b, p0/m, z3.b, z2.b
; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, #0
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT:    cmpne p0.b, p0/z, z3.b, #0
; CHECK-NEXT:    mov z4.b, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.b, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
  %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
  %c = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 1
  %d = select <vscale x 32 x i1> %c, <vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> %b
  ret <vscale x 32 x i8> %d
}

declare { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.umul.with.overflow.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>)

define <vscale x 64 x i8> @umulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y) {
; CHECK-LABEL: umulo_nxv64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.b, p0/m, z24.b, z7.b
; CHECK-NEXT:    umulh z3.b, p0/m, z3.b, z7.b
; CHECK-NEXT:    cmpne p1.b, p0/z, z3.b, #0
; CHECK-NEXT:    movprfx z3, z2
; CHECK-NEXT:    umulh z3.b, p0/m, z3.b, z6.b
; CHECK-NEXT:    cmpne p2.b, p0/z, z3.b, #0
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z5.b
; CHECK-NEXT:    umulh z1.b, p0/m, z1.b, z5.b
; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z6.b
; CHECK-NEXT:    cmpne p3.b, p0/z, z1.b, #0
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    umulh z1.b, p0/m, z1.b, z4.b
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z4.b
; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT:    mov z3.b, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.b, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.b, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z2.b, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z3.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.umul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
  %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
  %c = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 1
  %d = select <vscale x 64 x i1> %c, <vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> %b
  ret <vscale x 64 x i8> %d
}

declare { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>)

define <vscale x 2 x i16> @umulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y) {
; CHECK-LABEL: umulo_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    and z1.d, z1.d, #0xffff
; CHECK-NEXT:    and z0.d, z0.d, #0xffff
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    lsr z1.d, z2.d, #16
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
  %b = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> %b
  ret <vscale x 2 x i16> %d
}

declare { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>)

define <vscale x 4 x i16> @umulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
; CHECK-LABEL: umulo_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    and z1.s, z1.s, #0xffff
; CHECK-NEXT:    and z0.s, z0.s, #0xffff
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    lsr z1.s, z2.s, #16
; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
  %b = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> %b
  ret <vscale x 4 x i16> %d
}

declare { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)

define <vscale x 8 x i16> @umulo_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; CHECK-LABEL: umulo_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    umulh z2.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    cmpne p0.h, p0/z, z2.h, #0
; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
  %b = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> %b
  ret <vscale x 8 x i16> %d
}

declare { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>)

define <vscale x 16 x i16> @umulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y) {
; CHECK-LABEL: umulo_nxv16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.h, p0/m, z4.h, z3.h
; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT:    movprfx z3, z0
; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z2.h
; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT:    cmpne p0.h, p0/z, z3.h, #0
; CHECK-NEXT:    mov z4.h, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
  %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> %b
  ret <vscale x 16 x i16> %d
}

declare { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>)

define <vscale x 32 x i16> @umulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y) {
; CHECK-LABEL: umulo_nxv32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.h, p0/m, z24.h, z7.h
; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z7.h
; CHECK-NEXT:    cmpne p1.h, p0/z, z3.h, #0
; CHECK-NEXT:    movprfx z3, z2
; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z6.h
; CHECK-NEXT:    cmpne p2.h, p0/z, z3.h, #0
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z5.h
; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z5.h
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z6.h
; CHECK-NEXT:    cmpne p3.h, p0/z, z1.h, #0
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z4.h
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z4.h
; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT:    mov z3.h, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.h, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z2.h, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z3.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
  %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
  %c = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 1
  %d = select <vscale x 32 x i1> %c, <vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> %b
  ret <vscale x 32 x i16> %d
}

declare { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>)

define <vscale x 2 x i32> @umulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) {
; CHECK-LABEL: umulo_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    lsr z1.d, z2.d, #32
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
  %b = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %b
  ret <vscale x 2 x i32> %d
}

declare { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

define <vscale x 4 x i32> @umulo_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: umulo_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    umulh z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z2.s, #0
; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
  %b = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> %b
  ret <vscale x 4 x i32> %d
}

declare { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>)

define <vscale x 8 x i32> @umulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
; CHECK-LABEL: umulo_nxv8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z3.s
; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT:    movprfx z3, z0
; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z2.s
; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z3.s, #0
; CHECK-NEXT:    mov z4.s, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
  %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> %b
  ret <vscale x 8 x i32> %d
}

declare { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>)

define <vscale x 16 x i32> @umulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y) {
; CHECK-LABEL: umulo_nxv16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z7.s
; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z7.s
; CHECK-NEXT:    cmpne p1.s, p0/z, z3.s, #0
; CHECK-NEXT:    movprfx z3, z2
; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z6.s
; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z5.s
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z6.s
; CHECK-NEXT:    cmpne p3.s, p0/z, z1.s, #0
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z4.s
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z4.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    mov z3.s, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.s, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z2.s, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z3.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
  %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> %b
  ret <vscale x 16 x i32> %d
}

declare { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

define <vscale x 2 x i64> @umulo_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; CHECK-LABEL: umulo_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    umulh z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
  %b = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %b
  ret <vscale x 2 x i64> %d
}

declare { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>)

define <vscale x 4 x i64> @umulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y) {
; CHECK-LABEL: umulo_nxv4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.d, p0/m, z4.d, z3.d
; CHECK-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT:    movprfx z3, z0
; CHECK-NEXT:    umulh z3.d, p0/m, z3.d, z2.d
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z3.d, #0
; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
  %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> %b
  ret <vscale x 4 x i64> %d
}

declare { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>)

define <vscale x 8 x i64> @umulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y) {
; CHECK-LABEL: umulo_nxv8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z7.d
; CHECK-NEXT:    umulh z3.d, p0/m, z3.d, z7.d
; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
; CHECK-NEXT:    movprfx z3, z2
; CHECK-NEXT:    umulh z3.d, p0/m, z3.d, z6.d
; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, #0
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z5.d
; CHECK-NEXT:    umulh z1.d, p0/m, z1.d, z5.d
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z6.d
; CHECK-NEXT:    cmpne p3.d, p0/z, z1.d, #0
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    umulh z1.d, p0/m, z1.d, z4.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z4.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    mov z3.d, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.d, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z2.d, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z1.d, z3.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
  %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> %b
  ret <vscale x 8 x i64> %d
}