1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // RUN: %clang_cc1 -no-opaque-pointers %s -O0 -ffreestanding -triple=x86_64-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK64
3 // RUN: %clang_cc1 -no-opaque-pointers %s -O0 -ffreestanding -triple=i386-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK32
4 
5 #include <x86intrin.h>
6 
7 // CHECK64-LABEL: @test_loadiwkey(
8 // CHECK64-NEXT:  entry:
9 // CHECK64-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
10 // CHECK64-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
11 // CHECK64-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
12 // CHECK64-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
13 // CHECK64-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
14 // CHECK64-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
15 // CHECK64-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
16 // CHECK64-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
17 // CHECK64-NEXT:    store i32 [[CTL:%.*]], i32* [[CTL_ADDR]], align 4
18 // CHECK64-NEXT:    store <2 x i64> [[INTKEY:%.*]], <2 x i64>* [[INTKEY_ADDR]], align 16
19 // CHECK64-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], <2 x i64>* [[ENKEY_LO_ADDR]], align 16
20 // CHECK64-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], <2 x i64>* [[ENKEY_HI_ADDR]], align 16
21 // CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CTL_ADDR]], align 4
22 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[INTKEY_ADDR]], align 16
23 // CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_LO_ADDR]], align 16
24 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_HI_ADDR]], align 16
25 // CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__CTL_ADDR_I]], align 4
26 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__INTKEY_ADDR_I]], align 16
27 // CHECK64-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
28 // CHECK64-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
29 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__INTKEY_ADDR_I]], align 16
30 // CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
31 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
32 // CHECK64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
33 // CHECK64-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
34 // CHECK64-NEXT:    ret void
35 //
36 // CHECK32-LABEL: @test_loadiwkey(
37 // CHECK32-NEXT:  entry:
38 // CHECK32-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
39 // CHECK32-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
40 // CHECK32-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
41 // CHECK32-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
42 // CHECK32-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
43 // CHECK32-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
44 // CHECK32-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
45 // CHECK32-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
46 // CHECK32-NEXT:    store i32 [[CTL:%.*]], i32* [[CTL_ADDR]], align 4
47 // CHECK32-NEXT:    store <2 x i64> [[INTKEY:%.*]], <2 x i64>* [[INTKEY_ADDR]], align 16
48 // CHECK32-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], <2 x i64>* [[ENKEY_LO_ADDR]], align 16
49 // CHECK32-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], <2 x i64>* [[ENKEY_HI_ADDR]], align 16
50 // CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CTL_ADDR]], align 4
51 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[INTKEY_ADDR]], align 16
52 // CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_LO_ADDR]], align 16
53 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_HI_ADDR]], align 16
54 // CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__CTL_ADDR_I]], align 4
55 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__INTKEY_ADDR_I]], align 16
56 // CHECK32-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
57 // CHECK32-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
58 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__INTKEY_ADDR_I]], align 16
59 // CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
60 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
61 // CHECK32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
62 // CHECK32-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
63 // CHECK32-NEXT:    ret void
64 //
// Passes all four arguments straight through to _mm_loadiwkey.
// The FileCheck assertions above expect one call to @llvm.x86.loadiwkey
// taking the three vector operands (intkey, enkey_lo, enkey_hi) followed by
// ctl, and then a void return.
void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) {
  _mm_loadiwkey(ctl, intkey, enkey_lo, enkey_hi);
}
68 
69 // CHECK64-LABEL: @test_encodekey128_u32(
70 // CHECK64-NEXT:  entry:
71 // CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
72 // CHECK64-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
73 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
74 // CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
75 // CHECK64-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
76 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
77 // CHECK64-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
78 // CHECK64-NEXT:    store <2 x i64> [[KEY:%.*]], <2 x i64>* [[KEY_ADDR]], align 16
79 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
80 // CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
81 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_ADDR]], align 16
82 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
83 // CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
84 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_ADDR_I]], align 16
85 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
86 // CHECK64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
87 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
88 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
89 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
90 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
91 // CHECK64-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
92 // CHECK64-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
93 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
94 // CHECK64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP5]], i32 16
95 // CHECK64-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i64>*
96 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 1
97 // CHECK64-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
98 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP5]], i32 32
99 // CHECK64-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i64>*
100 // CHECK64-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP14]], align 1
101 // CHECK64-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
102 // CHECK64-NEXT:    ret i32 [[TMP15]]
103 //
104 // CHECK32-LABEL: @test_encodekey128_u32(
105 // CHECK32-NEXT:  entry:
106 // CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
107 // CHECK32-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
108 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
109 // CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
110 // CHECK32-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
111 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
112 // CHECK32-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
113 // CHECK32-NEXT:    store <2 x i64> [[KEY:%.*]], <2 x i64>* [[KEY_ADDR]], align 16
114 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
115 // CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
116 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_ADDR]], align 16
117 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
118 // CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
119 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_ADDR_I]], align 16
120 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
121 // CHECK32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
122 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
123 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
124 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
125 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
126 // CHECK32-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
127 // CHECK32-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
128 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
129 // CHECK32-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP5]], i32 16
130 // CHECK32-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i64>*
131 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 1
132 // CHECK32-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
133 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP5]], i32 32
134 // CHECK32-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i64>*
135 // CHECK32-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP14]], align 1
136 // CHECK32-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
137 // CHECK32-NEXT:    ret i32 [[TMP15]]
138 //
// Returns the result of _mm_encodekey128_u32(htype, key, h).
// The FileCheck assertions above expect a call to @llvm.x86.encodekey128 with
// (htype, key). Result fields 1, 2 and 3 are stored through h at byte offsets
// 0, 16 and 32 (align 1), and field 0 is the i32 return value.
unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
  return _mm_encodekey128_u32(htype, key, h);
}
142 
143 // CHECK64-LABEL: @test_encodekey256_u32(
144 // CHECK64-NEXT:  entry:
145 // CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
146 // CHECK64-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
147 // CHECK64-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
148 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
149 // CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
150 // CHECK64-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
151 // CHECK64-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
152 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
153 // CHECK64-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
154 // CHECK64-NEXT:    store <2 x i64> [[KEY_LO:%.*]], <2 x i64>* [[KEY_LO_ADDR]], align 16
155 // CHECK64-NEXT:    store <2 x i64> [[KEY_HI:%.*]], <2 x i64>* [[KEY_HI_ADDR]], align 16
156 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
157 // CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
158 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_LO_ADDR]], align 16
159 // CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_HI_ADDR]], align 16
160 // CHECK64-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[H_ADDR]], align 8
161 // CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
162 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
163 // CHECK64-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
164 // CHECK64-NEXT:    store i8* [[TMP3]], i8** [[__H_ADDR_I]], align 8
165 // CHECK64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
166 // CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
167 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
168 // CHECK64-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
169 // CHECK64-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
170 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
171 // CHECK64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
172 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
173 // CHECK64-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
174 // CHECK64-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP7]], i32 16
175 // CHECK64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <2 x i64>*
176 // CHECK64-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP13]], align 1
177 // CHECK64-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
178 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* [[TMP7]], i32 32
179 // CHECK64-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <2 x i64>*
180 // CHECK64-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
181 // CHECK64-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
182 // CHECK64-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP7]], i32 48
183 // CHECK64-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <2 x i64>*
184 // CHECK64-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP19]], align 1
185 // CHECK64-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
186 // CHECK64-NEXT:    ret i32 [[TMP20]]
187 //
188 // CHECK32-LABEL: @test_encodekey256_u32(
189 // CHECK32-NEXT:  entry:
190 // CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
191 // CHECK32-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
192 // CHECK32-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
193 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
194 // CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
195 // CHECK32-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
196 // CHECK32-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
197 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
198 // CHECK32-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
199 // CHECK32-NEXT:    store <2 x i64> [[KEY_LO:%.*]], <2 x i64>* [[KEY_LO_ADDR]], align 16
200 // CHECK32-NEXT:    store <2 x i64> [[KEY_HI:%.*]], <2 x i64>* [[KEY_HI_ADDR]], align 16
201 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
202 // CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
203 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_LO_ADDR]], align 16
204 // CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_HI_ADDR]], align 16
205 // CHECK32-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[H_ADDR]], align 4
206 // CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
207 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
208 // CHECK32-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
209 // CHECK32-NEXT:    store i8* [[TMP3]], i8** [[__H_ADDR_I]], align 4
210 // CHECK32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
211 // CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
212 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
213 // CHECK32-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
214 // CHECK32-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
215 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
216 // CHECK32-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
217 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
218 // CHECK32-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
219 // CHECK32-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP7]], i32 16
220 // CHECK32-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <2 x i64>*
221 // CHECK32-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP13]], align 1
222 // CHECK32-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
223 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* [[TMP7]], i32 32
224 // CHECK32-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <2 x i64>*
225 // CHECK32-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
226 // CHECK32-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
227 // CHECK32-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP7]], i32 48
228 // CHECK32-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <2 x i64>*
229 // CHECK32-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP19]], align 1
230 // CHECK32-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
231 // CHECK32-NEXT:    ret i32 [[TMP20]]
232 //
// Returns the result of _mm_encodekey256_u32(htype, key_lo, key_hi, h).
// The FileCheck assertions above expect a call to @llvm.x86.encodekey256 with
// (htype, key_lo, key_hi). Result fields 1 through 4 are stored through h at
// byte offsets 0, 16, 32 and 48 (align 1), and field 0 is the i32 return
// value.
unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i key_hi, void *h) {
  return _mm_encodekey256_u32(htype, key_lo, key_hi, h);
}
236 
237 // CHECK64-LABEL: @test_mm_aesenc256kl_u8(
238 // CHECK64-NEXT:  entry:
239 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
240 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
241 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
242 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
243 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
244 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
245 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
246 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
247 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
248 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
249 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
250 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
251 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
252 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
253 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
254 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
255 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
256 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
257 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
258 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
259 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
260 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
261 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
262 // CHECK64:       aesenc256kl_no_error.i:
263 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
264 // CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
265 // CHECK64:       aesenc256kl_error.i:
266 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
267 // CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
268 // CHECK64:       _mm_aesenc256kl_u8.exit:
269 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
270 // CHECK64-NEXT:    ret i8 [[TMP10]]
271 //
272 // CHECK32-LABEL: @test_mm_aesenc256kl_u8(
273 // CHECK32-NEXT:  entry:
274 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
275 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
276 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
277 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
278 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
279 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
280 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
281 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
282 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
283 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
284 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
285 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
286 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
287 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
288 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
289 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
290 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
291 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
292 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
293 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
294 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
295 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
296 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
297 // CHECK32:       aesenc256kl_no_error.i:
298 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
299 // CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
300 // CHECK32:       aesenc256kl_error.i:
301 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
302 // CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
303 // CHECK32:       _mm_aesenc256kl_u8.exit:
304 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
305 // CHECK32-NEXT:    ret i8 [[TMP10]]
306 //
// Returns the result of _mm_aesenc256kl_u8(odata, idata, h).
// The FileCheck assertions above expect a call to @llvm.x86.aesenc256kl with
// (idata, h). Field 0 of the result is truncated to i1 and branched on: the
// "no_error" block stores field 1 to *odata, the "error" block stores
// zeroinitializer to *odata. Field 0 is returned as i8 in both cases.
unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesenc256kl_u8(odata, idata, h);
}
310 
311 // CHECK64-LABEL: @test_mm_aesdec256kl_u8(
312 // CHECK64-NEXT:  entry:
313 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
314 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
315 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
316 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
317 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
318 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
319 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
320 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
321 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
322 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
323 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
324 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
325 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
326 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
327 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
328 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
329 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
330 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
331 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
332 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
333 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
334 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
335 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
336 // CHECK64:       aesdec256kl_no_error.i:
337 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
338 // CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
339 // CHECK64:       aesdec256kl_error.i:
340 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
341 // CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
342 // CHECK64:       _mm_aesdec256kl_u8.exit:
343 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
344 // CHECK64-NEXT:    ret i8 [[TMP10]]
345 //
346 // CHECK32-LABEL: @test_mm_aesdec256kl_u8(
347 // CHECK32-NEXT:  entry:
348 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
349 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
350 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
351 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
352 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
353 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
354 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
355 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
356 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
357 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
358 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
359 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
360 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
361 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
362 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
363 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
364 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
365 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
366 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
367 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
368 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
369 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
370 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
371 // CHECK32:       aesdec256kl_no_error.i:
372 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
373 // CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
374 // CHECK32:       aesdec256kl_error.i:
375 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
376 // CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
377 // CHECK32:       _mm_aesdec256kl_u8.exit:
378 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
379 // CHECK32-NEXT:    ret i8 [[TMP10]]
380 //
// Returns the result of _mm_aesdec256kl_u8(odata, idata, h).
// The FileCheck assertions above expect a call to @llvm.x86.aesdec256kl with
// (idata, h). Field 0 of the result is truncated to i1 and branched on: the
// "no_error" block stores field 1 to *odata, the "error" block stores
// zeroinitializer to *odata. Field 0 is returned as i8 in both cases.
unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesdec256kl_u8(odata, idata, h);
}
384 
385 // CHECK64-LABEL: @test_mm_aesenc128kl_u8(
386 // CHECK64-NEXT:  entry:
387 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
388 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
389 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
390 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
391 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
392 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
393 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
394 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
395 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
396 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
397 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
398 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
399 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
400 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
401 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
402 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
403 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
404 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
405 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
406 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
407 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
408 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
409 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
410 // CHECK64:       aesenc128kl_no_error.i:
411 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
412 // CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
413 // CHECK64:       aesenc128kl_error.i:
414 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
415 // CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
416 // CHECK64:       _mm_aesenc128kl_u8.exit:
417 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
418 // CHECK64-NEXT:    ret i8 [[TMP10]]
419 //
420 // CHECK32-LABEL: @test_mm_aesenc128kl_u8(
421 // CHECK32-NEXT:  entry:
422 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
423 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
424 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
425 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
426 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
427 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
428 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
429 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
430 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
431 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
432 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
433 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
434 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
435 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
436 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
437 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
438 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
439 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
440 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
441 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
442 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
443 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
444 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
445 // CHECK32:       aesenc128kl_no_error.i:
446 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
447 // CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
448 // CHECK32:       aesenc128kl_error.i:
449 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
450 // CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
451 // CHECK32:       _mm_aesenc128kl_u8.exit:
452 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
453 // CHECK32-NEXT:    ret i8 [[TMP10]]
454 //
// Codegen driver for _mm_aesenc128kl_u8: forwards odata, idata and h to the
// intrinsic unchanged and returns its unsigned char result.
//
// The autogenerated FileCheck assertions directly above pin the -O0 IR for
// this function on both triples: a single call to @llvm.x86.aesenc128kl taking
// the loaded idata and h, a branch on the truncated element 0 of the result,
// a store through odata of either element 1 (aesenc128kl_no_error.i) or
// zeroinitializer (aesenc128kl_error.i), and a return of element 0.
//
// Keep the body a single return statement; any extra local or statement
// changes the emitted IR and requires regenerating the assertions.
unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesenc128kl_u8(odata, idata, h);
}
458 
459 // CHECK64-LABEL: @test_mm_aesdec128kl_u8(
460 // CHECK64-NEXT:  entry:
461 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
462 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
463 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
464 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
465 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
466 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
467 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
468 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
469 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
470 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
471 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
472 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
473 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
474 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
475 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
476 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
477 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
478 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
479 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
480 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
481 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
482 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
483 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
484 // CHECK64:       aesdec128kl_no_error.i:
485 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
486 // CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
487 // CHECK64:       aesdec128kl_error.i:
488 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
489 // CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
490 // CHECK64:       _mm_aesdec128kl_u8.exit:
491 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
492 // CHECK64-NEXT:    ret i8 [[TMP10]]
493 //
494 // CHECK32-LABEL: @test_mm_aesdec128kl_u8(
495 // CHECK32-NEXT:  entry:
496 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
497 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
498 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
499 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
500 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
501 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
502 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
503 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
504 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
505 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
506 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
507 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
508 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
509 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
510 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
511 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
512 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
513 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
514 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
515 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
516 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
517 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
518 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
519 // CHECK32:       aesdec128kl_no_error.i:
520 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
521 // CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
522 // CHECK32:       aesdec128kl_error.i:
523 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
524 // CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
525 // CHECK32:       _mm_aesdec128kl_u8.exit:
526 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
527 // CHECK32-NEXT:    ret i8 [[TMP10]]
528 //
// Codegen driver for _mm_aesdec128kl_u8: forwards odata, idata and h to the
// intrinsic unchanged and returns its unsigned char result.
//
// The autogenerated FileCheck assertions directly above pin the -O0 IR for
// this function on both triples: a single call to @llvm.x86.aesdec128kl taking
// the loaded idata and h, a branch on the truncated element 0 of the result,
// a store through odata of either element 1 (aesdec128kl_no_error.i) or
// zeroinitializer (aesdec128kl_error.i), and a return of element 0.
//
// Keep the body a single return statement; any extra local or statement
// changes the emitted IR and requires regenerating the assertions.
unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesdec128kl_u8(odata, idata, h);
}
532 
533 // CHECK64-LABEL: @test__mm_aesencwide128kl_u8(
534 // CHECK64-NEXT:  entry:
535 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
536 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
537 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
538 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
539 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
540 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
541 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
542 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
543 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
544 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
545 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
546 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
547 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
548 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
549 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
550 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
551 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
552 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
553 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
554 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
555 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
556 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
557 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
558 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
559 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
560 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
561 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
562 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
563 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
564 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
565 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
566 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
567 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
568 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
569 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
570 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
571 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
572 // CHECK64:       aesencwide128kl_no_error.i:
573 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
574 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
575 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
576 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
577 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
578 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
579 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
580 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
581 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
582 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
583 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
584 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
585 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
586 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
587 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
588 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
589 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
590 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
591 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
592 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
593 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
594 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
595 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
596 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
597 // CHECK64:       aesencwide128kl_error.i:
598 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
599 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
600 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
601 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
602 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
603 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
604 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
605 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
606 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
607 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
608 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
609 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
610 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
611 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
612 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
613 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
614 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
615 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
616 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
617 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
618 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
619 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
620 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
621 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
622 // CHECK64:       _mm_aesencwide128kl_u8.exit:
623 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
624 // CHECK64-NEXT:    ret i8 [[TMP54]]
625 //
626 // CHECK32-LABEL: @test__mm_aesencwide128kl_u8(
627 // CHECK32-NEXT:  entry:
628 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
629 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
630 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
631 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
632 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
633 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
634 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
635 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
636 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
637 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
638 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
639 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
640 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
641 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
642 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
643 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
644 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
645 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
646 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
647 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
648 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
649 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
650 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
651 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
652 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
653 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
654 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
655 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
656 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
657 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
658 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
659 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
660 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
661 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
662 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
663 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
664 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
665 // CHECK32:       aesencwide128kl_no_error.i:
666 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
667 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
668 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
669 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
670 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
671 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
672 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
673 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
674 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
675 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
676 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
677 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
678 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
679 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
680 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
681 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
682 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
683 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
684 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
685 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
686 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
687 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
688 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
689 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
690 // CHECK32:       aesencwide128kl_error.i:
691 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
692 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
693 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
694 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
695 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
696 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
697 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
698 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
699 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
700 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
701 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
702 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
703 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
704 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
705 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
706 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
707 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
708 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
709 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
710 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
711 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
712 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
713 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
714 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
715 // CHECK32:       _mm_aesencwide128kl_u8.exit:
716 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
717 // CHECK32-NEXT:    ret i8 [[TMP54]]
718 //
// Codegen driver for _mm_aesencwide128kl_u8: forwards the odata and idata
// arrays and h to the intrinsic unchanged and returns its unsigned char
// result.
//
// The autogenerated FileCheck assertions directly above pin the -O0 IR for
// this function on both triples: eight <2 x i64> loads from idata (offsets
// 0 through 7), a single call to @llvm.x86.aesencwide128kl taking h followed
// by those eight vectors, a branch on the truncated element 0 of the result,
// eight stores through odata of either result elements 1 through 8
// (aesencwide128kl_no_error.i) or zeroinitializer (aesencwide128kl_error.i),
// and a return of element 0.
//
// Keep the body a single return statement; any extra local or statement
// changes the emitted IR and requires regenerating the assertions.
unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesencwide128kl_u8(odata, idata, h);
}
722 
723 // CHECK64-LABEL: @test__mm_aesdecwide128kl_u8(
724 // CHECK64-NEXT:  entry:
725 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
726 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
727 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
728 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
729 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
730 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
731 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
732 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
733 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
734 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
735 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
736 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
737 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
738 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
739 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
740 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
741 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
742 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
743 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
744 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
745 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
746 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
747 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
748 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
749 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
750 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
751 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
752 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
753 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
754 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
755 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
756 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
757 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
758 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
759 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
760 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
761 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
762 // CHECK64:       aesdecwide128kl_no_error.i:
763 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
764 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
765 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
766 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
767 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
768 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
769 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
770 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
771 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
772 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
773 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
774 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
775 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
776 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
777 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
778 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
779 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
780 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
781 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
782 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
783 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
784 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
785 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
786 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
787 // CHECK64:       aesdecwide128kl_error.i:
788 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
789 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
790 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
791 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
792 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
793 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
794 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
795 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
796 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
797 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
798 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
799 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
800 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
801 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
802 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
803 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
804 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
805 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
806 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
807 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
808 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
809 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
810 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
811 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
812 // CHECK64:       _mm_aesdecwide128kl_u8.exit:
813 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
814 // CHECK64-NEXT:    ret i8 [[TMP54]]
815 //
816 // CHECK32-LABEL: @test__mm_aesdecwide128kl_u8(
817 // CHECK32-NEXT:  entry:
818 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
819 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
820 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
821 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
822 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
823 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
824 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
825 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
826 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
827 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
828 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
829 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
830 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
831 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
832 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
833 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
834 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
835 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
836 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
837 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
838 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
839 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
840 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
841 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
842 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
843 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
844 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
845 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
846 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
847 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
848 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
849 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
850 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
851 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
852 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
853 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
854 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
855 // CHECK32:       aesdecwide128kl_no_error.i:
856 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
857 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
858 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
859 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
860 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
861 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
862 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
863 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
864 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
865 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
866 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
867 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
868 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
869 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
870 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
871 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
872 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
873 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
874 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
875 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
876 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
877 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
878 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
879 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
880 // CHECK32:       aesdecwide128kl_error.i:
881 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
882 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
883 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
884 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
885 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
886 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
887 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
888 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
889 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
890 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
891 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
892 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
893 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
894 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
895 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
896 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
897 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
898 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
899 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
900 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
901 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
902 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
903 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
904 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
905 // CHECK32:       _mm_aesdecwide128kl_u8.exit:
906 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
907 // CHECK32-NEXT:    ret i8 [[TMP54]]
908 //
// Test wrapper: forwards odata, idata and h unchanged to
// _mm_aesdecwide128kl_u8 and returns its result.
//
// The autogenerated assertions above expect, for both the x86_64 and the i386
// invocation, one call to @llvm.x86.aesdecwide128kl that receives h plus the
// eight <2 x i64> values loaded from idata[0..7]. They then expect a branch on
// the low bit of the i8 status in field 0 of the result: when it is set the
// eight result vectors are stored to odata[0..7], and when it is clear
// odata[0..7] is filled with zeroinitializer. That i8 status is the value
// returned.
//
// Keep the body a single direct call: the assertions match the -O0 IR
// instruction by instruction, so adding a local or another statement means
// regenerating them with utils/update_cc_test_checks.py.
test__mm_aesdecwide128kl_u8(__m128i odata[8],const __m128i idata[8],const void * h)909 unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
910   return _mm_aesdecwide128kl_u8(odata, idata, h);
911 }
912 
913 // CHECK64-LABEL: @test__mm_aesencwide256kl_u8(
914 // CHECK64-NEXT:  entry:
915 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
916 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
917 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
918 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
919 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
920 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
921 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
922 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
923 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
924 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
925 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
926 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
927 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
928 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
929 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
930 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
931 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
932 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
933 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
934 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
935 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
936 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
937 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
938 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
939 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
940 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
941 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
942 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
943 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
944 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
945 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
946 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
947 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
948 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
949 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
950 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
951 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
952 // CHECK64:       aesencwide256kl_no_error.i:
953 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
954 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
955 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
956 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
957 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
958 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
959 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
960 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
961 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
962 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
963 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
964 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
965 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
966 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
967 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
968 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
969 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
970 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
971 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
972 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
973 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
974 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
975 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
976 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
977 // CHECK64:       aesencwide256kl_error.i:
978 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
979 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
980 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
981 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
982 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
983 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
984 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
985 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
986 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
987 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
988 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
989 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
990 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
991 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
992 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
993 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
994 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
995 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
996 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
997 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
998 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
999 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1000 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1001 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
1002 // CHECK64:       _mm_aesencwide256kl_u8.exit:
1003 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1004 // CHECK64-NEXT:    ret i8 [[TMP54]]
1005 //
1006 // CHECK32-LABEL: @test__mm_aesencwide256kl_u8(
1007 // CHECK32-NEXT:  entry:
1008 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1009 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1010 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
1011 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1012 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1013 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
1014 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
1015 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
1016 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
1017 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
1018 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
1019 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
1020 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
1021 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
1022 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
1023 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
1024 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
1025 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
1026 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
1027 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
1028 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
1029 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
1030 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
1031 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
1032 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
1033 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
1034 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
1035 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
1036 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
1037 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
1038 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
1039 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
1040 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
1041 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
1042 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1043 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
1044 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
1045 // CHECK32:       aesencwide256kl_no_error.i:
1046 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1047 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1048 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1049 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1050 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1051 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1052 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1053 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1054 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1055 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1056 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1057 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1058 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1059 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1060 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1061 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1062 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1063 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1064 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1065 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1066 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1067 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1068 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1069 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
1070 // CHECK32:       aesencwide256kl_error.i:
1071 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1072 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1073 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1074 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1075 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1076 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1077 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1078 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1079 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1080 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1081 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1082 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1083 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1084 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1085 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1086 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1087 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1088 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1089 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1090 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1091 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1092 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1093 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1094 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
1095 // CHECK32:       _mm_aesencwide256kl_u8.exit:
1096 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1097 // CHECK32-NEXT:    ret i8 [[TMP54]]
1098 //
// Codegen test wrapper: forwards odata, idata and h unchanged to
// _mm_aesencwide256kl_u8 and returns the intrinsic's unsigned char result.
// The FileCheck assertions preceding this function pin the exact -O0 IR it
// lowers to, so the body must remain a single return statement; introducing
// a local or an extra statement changes the emitted IR and requires the
// assertions to be regenerated with utils/update_cc_test_checks.py.
unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesencwide256kl_u8(odata, idata, h);
}
1102 
1103 // CHECK64-LABEL: @test__mm_aesdecwide256kl_u8(
1104 // CHECK64-NEXT:  entry:
1105 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
1106 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
1107 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
1108 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
1109 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
1110 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
1111 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
1112 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
1113 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
1114 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
1115 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
1116 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
1117 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
1118 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
1119 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
1120 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
1121 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
1122 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
1123 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
1124 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
1125 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
1126 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
1127 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
1128 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
1129 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
1130 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
1131 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
1132 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
1133 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
1134 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
1135 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
1136 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
1137 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
1138 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
1139 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1140 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
1141 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
1142 // CHECK64:       aesdecwide256kl_no_error.i:
1143 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1144 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1145 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1146 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1147 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1148 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1149 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1150 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1151 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1152 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1153 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1154 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1155 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1156 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1157 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1158 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1159 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1160 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1161 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1162 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1163 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1164 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1165 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1166 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
1167 // CHECK64:       aesdecwide256kl_error.i:
1168 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1169 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1170 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1171 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1172 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1173 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1174 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1175 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1176 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1177 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1178 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1179 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1180 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1181 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1182 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1183 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1184 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1185 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1186 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1187 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1188 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1189 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1190 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1191 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
1192 // CHECK64:       _mm_aesdecwide256kl_u8.exit:
1193 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1194 // CHECK64-NEXT:    ret i8 [[TMP54]]
1195 //
1196 // CHECK32-LABEL: @test__mm_aesdecwide256kl_u8(
1197 // CHECK32-NEXT:  entry:
1198 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1199 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1200 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
1201 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1202 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1203 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
1204 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
1205 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
1206 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
1207 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
1208 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
1209 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
1210 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
1211 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
1212 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
1213 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
1214 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
1215 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
1216 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
1217 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
1218 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
1219 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
1220 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
1221 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
1222 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
1223 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
1224 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
1225 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
1226 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
1227 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
1228 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
1229 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
1230 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
1231 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
1232 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1233 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
1234 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
1235 // CHECK32:       aesdecwide256kl_no_error.i:
1236 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1237 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1238 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1239 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1240 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1241 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1242 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1243 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1244 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1245 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1246 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1247 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1248 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1249 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1250 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1251 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1252 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1253 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1254 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1255 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1256 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1257 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1258 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1259 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
1260 // CHECK32:       aesdecwide256kl_error.i:
1261 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1262 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1263 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1264 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1265 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1266 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1267 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1268 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1269 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1270 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1271 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1272 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1273 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1274 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1275 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1276 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1277 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1278 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1279 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1280 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1281 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1282 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1283 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1284 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
1285 // CHECK32:       _mm_aesdecwide256kl_u8.exit:
1286 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1287 // CHECK32-NEXT:    ret i8 [[TMP54]]
1288 //
// Codegen test wrapper: forwards odata, idata and h unchanged to
// _mm_aesdecwide256kl_u8 and returns the intrinsic's unsigned char result.
// Per the FileCheck assertions preceding this function, the expected -O0 IR
// loads eight <2 x i64> blocks from idata, calls @llvm.x86.aesdecwide256kl
// with h and those blocks, then stores the eight result blocks to odata when
// the low bit of the returned i8 is set and stores zero vectors otherwise;
// the i8 itself is the return value.
// Keep the body a single return statement: any extra local or statement
// changes the emitted IR and requires the assertions to be regenerated with
// utils/update_cc_test_checks.py.
unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesdecwide256kl_u8(odata, idata, h);
}
1292