; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -load-store-vectorizer -dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; To prove that the vectorization is safe, it tries to match nested adds
; and find an expression that adds a constant value to an existing index
; without overflowing. For example, the four indices loaded in
; @ld_v4i8_add_nsw are (v1 + v0) - 1, v1 + v0, (v1 + v0) + 1 and
; (v1 + v0) + 2, i.e. four consecutive bytes.

target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

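; Same as above, but using nuw adds and a zext of the index. The
; no-unsigned-wrap flags guarantee the zero-extended indices stay
; consecutive, so the loads are vectorized the same way.
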
define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v1, %v0
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %v1, %tmp9
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v0, %v1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %tmp9, %v1
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nuw_operand_orders(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v0, %v1
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %tmp9, %v1
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

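; No nsw/nuw flags on the adds here; instead %v0 and %v1 are known multiples
; of 4. With the two low bits of the base index %v1 + %v0 known to be zero,
; adding 1 or 2 cannot signed-overflow, so those loads are merged with the
; base load into a <3 x i8> access, while the base - 1 load (which could
; still overflow) stays scalar.
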
define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

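; Same known bits, but the outlying element is at offset 3 instead of -1, so
; all four indices are provably consecutive and a single <4 x i8> load is
; formed; the offset-3 element is shuffled back into lane 0 of the stored
; vector.
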
define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

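; Multiplying by 3 gives no known-zero low bits by itself; here the
; llvm.assume calls establish that the two low bits of %v0 and %v1 are zero,
; which again lets all four loads be merged.
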
define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 3
  %v1 = mul i32 %ind1, 3
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

declare void @llvm.assume(i1)

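; The same assumptions work when made directly on the function arguments.
; As in @ld_v4i8_add_known_bits, the base - 1 load stays scalar and only the
; other three loads are merged into a <3 x i8> access.
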
define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Address computations are partly separated by control flow, and llvm.assume
; is placed in the second basic block.

define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  br label %bb.skip

bb.skip:
  ret void
}

; Same as ld_v2i8_add_different_contexts but with llvm.assume placed between loads

define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  br label %bb.skip

bb.skip:
  ret void
}

; llvm.assume is placed between loads in a single basic block

define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_context(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  ret void
}

; Placing llvm.assume after all the loads and stores in the basic block still works

define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_context1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  ret void
}

; Make sure we don't vectorize the loads below: the sources of the sext
; instructions have neither nsw flags nor known bits that would prove the
; indices are consecutive.

define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}