; Exercises the ARM parallel DSP pass (-arm-parallel-dsp) on armv7-a.
; Every function loads adjacent i16 elements from %a and %b, sign-extends
; them to i32, multiplies them in pairs and sums the products. The expected
; output replaces the narrow loads with i32 loads feeding the dual
; multiply-accumulate intrinsics; the "x" variants (smladx / smlaldx) are
; expected where the multiplies pair element 0 of one array with element 1
; of the other.
; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s

; a[0]*b[1] + a[1]*b[0], then added to %acc.
; CHECK-LABEL: exchange_1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.1
  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  %add = add i32 %mul.0, %mul.1
  %res = add i32 %add, %acc
  ret i32 %res
}

; Same products as exchange_1, but each mul lists the %b operand first.
; CHECK-LABEL: exchange_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.b.1, %sext.a.0
  %mul.1 = mul i32 %sext.b.0, %sext.a.1
  %add = add i32 %mul.0, %mul.1
  %res = add i32 %add, %acc
  ret i32 %res
}

; Same muls as exchange_1, but the add takes %mul.1 before %mul.0; the
; expected intrinsic call lists the %b load first.
; CHECK-LABEL: exchange_3
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.1
  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  %add = add i32 %mul.1, %mul.0
  %res = add i32 %add, %acc
  ret i32 %res
}

; Combines exchange_2 and exchange_3: %b operand first in each mul, and the
; add takes %mul.1 before %mul.0.
; CHECK-LABEL: exchange_4
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.b.1, %sext.a.0
  %mul.1 = mul i32 %sext.b.0, %sext.a.1
  %add = add i32 %mul.1, %mul.0
  %res = add i32 %add, %acc
  ret i32 %res
}

; The %b loads are shared by two product pairs:
;   a[0]*b[1] + a[1]*b[0]   and   a[3]*b[1] + a[2]*b[0].
; Expected: an smladx for the first pair feeding an smlad for the second.
; CHECK-LABEL: exchange_multi_use_1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.1
  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  %add = add i32 %mul.0, %mul.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.a.3, %sext.b.1
  %mul.3 = mul i32 %sext.a.2, %sext.b.0
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add, %add.1
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; Same products as exchange_multi_use_1 with an i64 accumulator: the i32 sum
; of both pairs is sign-extended once before being added to %acc.
; CHECK-LABEL: exchange_multi_use_64_1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.1
  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  %add = add i32 %mul.0, %mul.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.a.3, %sext.b.1
  %mul.3 = mul i32 %sext.a.2, %sext.b.0
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add, %add.1
  %sext.add.2 = sext i32 %add.2 to i64
  %res = add i64 %sext.add.2, %acc
  ret i64 %res
}

; Like exchange_multi_use_64_1, but each pair's i32 sum is sign-extended to
; i64 separately and the two i64 values are then added.
; CHECK-LABEL: exchange_multi_use_64_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.1
  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  %add = add i32 %mul.0, %mul.1
  %sext.add = sext i32 %add to i64
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.a.3, %sext.b.1
  %mul.3 = mul i32 %sext.a.2, %sext.b.0
  %add.1 = add i32 %mul.2, %mul.3
  %sext.add.1 = sext i32 %add.1 to i64
  %add.2 = add i64 %sext.add, %sext.add.1
  %res = add i64 %add.2, %acc
  ret i64 %res
}

; First pair is in-order (a[0]*b[0] + a[1]*b[1]); second pair crosses the
; elements (b[0]*a[3] + b[1]*a[2]). Expected: smlad feeding smladx.
; CHECK-LABEL: exchange_multi_use_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.b.0, %sext.a.3
  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add, %add.1
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; Same products as exchange_multi_use_2, but the second pair's sum is
; subtracted from the first (%add - %add.1) before %acc is added. Only the
; smladx with a zero accumulator is expected.
; TODO: Why aren't two intrinsics generated?
; CHECK-LABEL: exchange_multi_use_3
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK-NOT: call i32 @llvm.arm.smlad
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.b.0, %sext.a.3
  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.2, %mul.3
  %sub = sub i32 %add, %add.1
  %res = add i32 %acc, %sub
  ret i32 %res
}

; 64-bit variant: both pair sums are sign-extended to i64 and added, and
; that total is subtracted from %acc. Expected: smlaldx with a zero
; accumulator feeding smlald.
; TODO: Would it be better to generate a smlad and then sign extend it?
; CHECK-LABEL: exchange_multi_use_64_3
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0)
; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.b.0, %sext.a.3
  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.2, %mul.3
  %sext.add = sext i32 %add to i64
  %sext.add.1 = sext i32 %add.1 to i64
  %add.2 = add i64 %sext.add, %sext.add.1
  %res = sub i64 %acc, %add.2
  ret i64 %res
}

; Same as exchange_multi_use_3 except %add.1 is computed before %add. Only
; the smlad with a zero accumulator is expected.
; TODO: Why isn't smladx generated too?
; CHECK-LABEL: exchange_multi_use_4
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
; CHECK-NOT: call i32 @llvm.arm.smlad
define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.b.0, %sext.a.3
  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add.1 = add i32 %mul.2, %mul.3
  %add = add i32 %mul.0, %mul.1
  %sub = sub i32 %add, %add.1
  %res = add i32 %acc, %sub
  ret i32 %res
}

; a[1]*b[0] + a[0]*b[1]: the a[1] product is computed first; the expected
; intrinsic call lists the %b load first.
; CHECK-LABEL: exchange_swap
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.1, %sext.b.0
  %mul.1 = mul i32 %sext.a.0, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  %res = add i32 %add, %acc
  ret i32 %res
}

; Same muls as exchange_swap, but the add takes %mul.1 before %mul.0; the
; expected intrinsic call lists the %a load first.
; CHECK-LABEL: exchange_swap_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.1, %sext.b.0
  %mul.1 = mul i32 %sext.a.0, %sext.b.1
  %add = add i32 %mul.1, %mul.0
  %res = add i32 %add, %acc
  ret i32 %res
}

; Like exchange_swap_2, but each mul lists the %b operand first.
; CHECK-LABEL: exchange_swap_3
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.b.0, %sext.a.1
  %mul.1 = mul i32 %sext.b.1, %sext.a.0
  %add = add i32 %mul.1, %mul.0
  %res = add i32 %add, %acc
  ret i32 %res
}