1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefix=SSE 3; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx | FileCheck %s --check-prefix=AVX 4; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=AVX 5 6; Verify that the SLP vectorizer is able to figure out that commutativity 7; offers the possibility to splat/broadcast %c and thus make it profitable 8; to vectorize this case 9 10@cle = external unnamed_addr global [32 x i8], align 16 11@cle32 = external unnamed_addr global [32 x i32], align 16 12 13 14; Check that we correctly detect a splat/broadcast by leveraging the 15; commutativity property of `xor`. 16 17define void @splat(i8 %a, i8 %b, i8 %c) { 18; SSE-LABEL: @splat( 19; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 20; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 21; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 22; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 23; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer 24; SSE-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] 25; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 26; SSE-NEXT: ret void 27; 28; AVX-LABEL: @splat( 29; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 30; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 31; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 32; AVX-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 33; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer 34; AVX-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] 35; AVX-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 36; AVX-NEXT: ret void 37; 38 %1 = xor i8 %c, %a 39 store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16 40 %2 = xor i8 %a, %c 41 store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1) 42 %3 = xor i8 %a, %c 43 store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2) 44 %4 = xor i8 %a, %c 45 store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3) 46 %5 = xor i8 %c, %a 47 store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4) 48 %6 = xor i8 %c, %b 49 store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5) 50 %7 = xor i8 %c, %a 51 store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6) 52 %8 = xor i8 %c, %b 53 store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7) 54 %9 = xor i8 %a, %c 55 store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8) 56 %10 = xor i8 %a, %c 57 store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9) 58 %11 = xor i8 %a, %c 59 store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10) 60 %12 = xor i8 %a, %c 61 store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11) 62 %13 = xor i8 %a, %c 63 store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12) 64 %14 = xor i8 %a, %c 65 store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13) 66 %15 = xor i8 %a, %c 67 store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14) 68 %16 = xor i8 %a, %c 69 store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15) 70 ret void 71} 72 73; Check that we correctly detect that we can have the same opcode on one side by 74; leveraging the commutativity property of `xor`. 75 76define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) { 77; SSE-LABEL: @same_opcode_on_one_side( 78; SSE-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]] 79; SSE-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]] 80; SSE-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]] 81; SSE-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]] 82; SSE-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]] 83; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16 84; SSE-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]] 85; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1), align 4 86; SSE-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]] 87; SSE-NEXT: store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2), align 4 88; SSE-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]] 89; SSE-NEXT: store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3), align 4 90; SSE-NEXT: ret void 91; 92; AVX-LABEL: @same_opcode_on_one_side( 93; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 94; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer 95; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 96; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer 97; AVX-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] 98; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1 99; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2 100; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 3 101; AVX-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], [[TMP6]] 102; AVX-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16 103; AVX-NEXT: ret void 104; 105 %add1 = add i32 %c, %a 106 %add2 = add i32 %c, %a 107 %add3 = add i32 %a, %c 108 %add4 = add i32 %c, %a 109 %1 = xor i32 %add1, %a 110 store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16 111 %2 = xor i32 %b, %add2 112 store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1) 113 %3 = xor i32 %c, %add3 114 store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2) 115 %4 = xor i32 %a, %add4 116 store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3) 117 ret void 118} 119