1 /*-
2 * Copyright (c) 2016 The FreeBSD Foundation
3 * Copyright (c) 2020 Ampere Computing
4 * All rights reserved.
5 *
6 * This software was developed by Andrew Turner under
7 * sponsorship from the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
/*
 * This code is built with floating-point enabled.  Callers must have entered
 * a floating-point context before calling any of these functions.
 */
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/malloc.h>
42 #include <sys/queue.h>
43
44 #include <opencrypto/cryptodev.h>
45 #include <opencrypto/gmac.h>
46 #include <crypto/rijndael/rijndael.h>
47 #include <crypto/armv8/armv8_crypto.h>
48
49 #include <arm_neon.h>
50
/*
 * Encrypt one 16-byte block held in a NEON register.
 *
 * Round keys are read in order starting at keysched[0].  While the running
 * index is below rounds - 1, two AESE/AESMC steps are applied per iteration.
 * keysched[rounds - 1] then gets one more AESE/AESMC step, keysched[rounds]
 * an AESE step without AESMC, and keysched[rounds + 1] is XOR-ed in last.
 * The schedule must therefore hold at least rounds + 2 vectors.
 */
static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t state;
	int k;

	state = from;
	k = 0;
	while (k < rounds - 1) {
		state = vaesmcq_u8(vaeseq_u8(state, keysched[k]));
		state = vaesmcq_u8(vaeseq_u8(state, keysched[k + 1]));
		k += 2;
	}

	/* Final steps: the last AESE is not followed by AESMC. */
	state = vaesmcq_u8(vaeseq_u8(state, keysched[rounds - 1]));
	state = vaeseq_u8(state, keysched[rounds]);

	return (veorq_u8(state, keysched[rounds + 1]));
}
72
/*
 * Decrypt one 16-byte block held in a NEON register.
 *
 * Mirrors armv8_aes_enc() using the AESD/AESIMC intrinsics: round keys are
 * read in order from keysched[0], two AESD/AESIMC steps per iteration while
 * the running index is below rounds - 1.  keysched[rounds - 1] then gets one
 * more AESD/AESIMC step, keysched[rounds] an AESD step alone, and
 * keysched[rounds + 1] is XOR-ed in last.  The schedule must hold at least
 * rounds + 2 vectors.
 */
static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t state;
	int k;

	state = from;
	k = 0;
	while (k < rounds - 1) {
		state = vaesimcq_u8(vaesdq_u8(state, keysched[k]));
		state = vaesimcq_u8(vaesdq_u8(state, keysched[k + 1]));
		k += 2;
	}

	/* Final steps: the last AESD is not followed by AESIMC. */
	state = vaesimcq_u8(vaesdq_u8(state, keysched[rounds - 1]));
	state = vaesdq_u8(state, keysched[rounds]);

	return (veorq_u8(state, keysched[rounds + 1]));
}
94
/*
 * AES-CBC encrypt len / AES_BLOCK_LEN whole blocks from "from" into "to".
 *
 * Each plaintext block is XOR-ed with the previous ciphertext block (the IV
 * for the first block) before being encrypted.  Bytes beyond the last whole
 * block are ignored.  The IV buffer itself is not modified.
 */
void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	const uint8x16_t *sched;
	uint8x16_t chain;
	size_t remaining;
	int rounds;

	sched = (const void *)key->aes_key;
	rounds = key->aes_rounds - 1;
	chain = vld1q_u8(iv);

	for (remaining = len / AES_BLOCK_LEN; remaining > 0; remaining--) {
		/* The ciphertext just produced chains into the next block. */
		chain = armv8_aes_enc(rounds, sched,
		    veorq_u8(vld1q_u8(from), chain));
		vst1q_u8(to, chain);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}
114
/*
 * AES-CBC decrypt len / AES_BLOCK_LEN whole blocks of "buf" in place.
 *
 * Each ciphertext block is saved before it is overwritten so that it can be
 * XOR-ed into the following block's decryption.  Bytes beyond the last whole
 * block are left untouched.  The IV buffer itself is not modified.
 */
void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	const uint8x16_t *sched;
	uint8x16_t cipher, plain, prev;
	size_t remaining;
	int rounds;

	sched = (const void *)key->aes_key;
	rounds = key->aes_rounds - 1;
	prev = vld1q_u8(iv);

	for (remaining = len / AES_BLOCK_LEN; remaining > 0; remaining--) {
		cipher = vld1q_u8(buf);
		plain = veorq_u8(armv8_aes_dec(rounds, sched, cipher), prev);
		vst1q_u8(buf, plain);
		prev = cipher;
		buf += AES_BLOCK_LEN;
	}
}
133
/* Bytes handled per XTS block. */
#define	AES_XTS_BLOCKSIZE	16
/* Bytes of the caller-supplied IV that are copied into the initial tweak. */
#define	AES_XTS_IVSIZE		8
/* GF(2^128) feedback constant folded in when the tweak's top bit shifts out. */
#define	AES_XTS_ALPHA		0x87
137
/*
 * Advance the XTS tweak by one step: shift the 128-bit value (held as four
 * 32-bit lanes) left by one bit and fold the bit shifted out of the top lane
 * back into the bottom lane using AES_XTS_ALPHA.
 */
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t feedback = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t carry;

	/*
	 * Rotate the lanes so that each lane sees its lower neighbour (lane 0
	 * sees lane 3), then smear that neighbour's sign bit across the lane.
	 * Masking leaves the carry-in bit for lanes 1-3 and the feedback
	 * constant for lane 0.
	 */
	carry = vshrq_n_s32(vextq_s32(inp, inp, 3), 31);
	carry &= feedback;

	/* Shift every lane left by one and merge the carries. */
	return (vshlq_n_s32(inp, 1) ^ carry);
}
155
/*
 * Process one 16-byte XTS block: XOR the input with the current tweak, run
 * it through AES (encrypt when do_encrypt is non-zero, decrypt otherwise),
 * XOR with the same tweak again and store the result in "to".  On return
 * *tweak has been advanced for the next block.
 */
static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t cur, state;

	cur = *tweak;
	state = vld1q_u8(from) ^ cur;

	state = do_encrypt ?
	    armv8_aes_enc(rounds - 1, key_schedule, state) :
	    armv8_aes_dec(rounds - 1, key_schedule, state);

	vst1q_u8(to, state ^ cur);

	/* The LFSR helper works on 32-bit lanes; reinterpret around it. */
	*tweak = vreinterpretq_u8_s32(
	    xts_crank_lfsr(vreinterpretq_s32_u8(cur)));
}
173
/*
 * Common AES-XTS worker for both directions.
 *
 * The initial tweak is the encryption, under tweak_schedule, of a 16-byte
 * block whose first AES_XTS_IVSIZE bytes come from "iv" and whose remaining
 * bytes are zero.  len / AES_XTS_BLOCKSIZE whole blocks are then processed
 * from "from" into "to" with data_schedule; any trailing partial block is
 * ignored.
 */
static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8_t ivblock[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8x16_t tweakreg;
	size_t remaining;

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	/* Last 64 bits of IV are always zero. */
	bzero(ivblock, sizeof(ivblock));
	bcopy(iv, ivblock, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule,
	    vld1q_u8(ivblock));

	for (remaining = len / AES_XTS_BLOCKSIZE; remaining > 0;
	    remaining--) {
		armv8_aes_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}
205
206 void
armv8_aes_encrypt_xts(AES_key_t * data_schedule,const void * tweak_schedule,size_t len,const uint8_t * from,uint8_t * to,const uint8_t iv[static AES_BLOCK_LEN])207 armv8_aes_encrypt_xts(AES_key_t *data_schedule,
208 const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
209 const uint8_t iv[static AES_BLOCK_LEN])
210 {
211
212 armv8_aes_crypt_xts(data_schedule->aes_rounds,
213 (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
214 to, iv, 1);
215 }
216
217 void
armv8_aes_decrypt_xts(AES_key_t * data_schedule,const void * tweak_schedule,size_t len,const uint8_t * from,uint8_t * to,const uint8_t iv[static AES_BLOCK_LEN])218 armv8_aes_decrypt_xts(AES_key_t *data_schedule,
219 const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
220 const uint8_t iv[static AES_BLOCK_LEN])
221 {
222
223 armv8_aes_crypt_xts(data_schedule->aes_rounds,
224 (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
225 to,iv, 0);
226
227 }
228
/*
 * Increment, in place, a big-endian counter that occupies AES_BLOCK_LEN
 * bytes: bump the last byte and keep carrying toward index 0 for as long as
 * the byte just incremented wrapped around to zero.  The argument is
 * evaluated more than once.
 */
#define	AES_INC_COUNTER(counter)					\
	do {								\
		int idx = AES_BLOCK_LEN;				\
									\
		while (idx-- > 0) {					\
			if (++(counter)[idx] != 0)			\
				break;					\
		}							\
	} while (0)
236
237 void
armv8_aes_encrypt_gcm(AES_key_t * aes_key,size_t len,const uint8_t * from,uint8_t * to,size_t authdatalen,const uint8_t * authdata,uint8_t tag[static GMAC_DIGEST_LEN],const uint8_t iv[static AES_GCM_IV_LEN],const __uint128_val_t * Htable)238 armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
239 const uint8_t *from, uint8_t *to,
240 size_t authdatalen, const uint8_t *authdata,
241 uint8_t tag[static GMAC_DIGEST_LEN],
242 const uint8_t iv[static AES_GCM_IV_LEN],
243 const __uint128_val_t *Htable)
244 {
245 size_t i;
246 const uint64_t *from64;
247 uint64_t *to64;
248 uint8_t aes_counter[AES_BLOCK_LEN];
249 uint8_t block[AES_BLOCK_LEN];
250 size_t trailer;
251 __uint128_val_t EK0, EKi, Xi, lenblock;
252
253 bzero(&aes_counter, AES_BLOCK_LEN);
254 memcpy(aes_counter, iv, AES_GCM_IV_LEN);
255
256 /* Setup the counter */
257 aes_counter[AES_BLOCK_LEN - 1] = 1;
258
259 /* EK0 for a final GMAC round */
260 aes_v8_encrypt(aes_counter, EK0.c, aes_key);
261
262 /* GCM starts with 2 as counter, 1 is used for final xor of tag. */
263 aes_counter[AES_BLOCK_LEN - 1] = 2;
264
265 memset(Xi.c, 0, sizeof(Xi.c));
266 trailer = authdatalen % AES_BLOCK_LEN;
267 if (authdatalen - trailer > 0) {
268 gcm_ghash_v8(Xi.u, Htable, authdata, authdatalen - trailer);
269 authdata += authdatalen - trailer;
270 }
271 if (trailer > 0 || authdatalen == 0) {
272 memset(block, 0, sizeof(block));
273 memcpy(block, authdata, trailer);
274 gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
275 }
276
277 from64 = (const uint64_t*)from;
278 to64 = (uint64_t*)to;
279 trailer = len % AES_BLOCK_LEN;
280
281 for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
282 aes_v8_encrypt(aes_counter, EKi.c, aes_key);
283 AES_INC_COUNTER(aes_counter);
284 to64[0] = from64[0] ^ EKi.u[0];
285 to64[1] = from64[1] ^ EKi.u[1];
286 gcm_ghash_v8(Xi.u, Htable, (uint8_t*)to64, AES_BLOCK_LEN);
287
288 to64 += 2;
289 from64 += 2;
290 }
291
292 to += (len - trailer);
293 from += (len - trailer);
294
295 if (trailer) {
296 aes_v8_encrypt(aes_counter, EKi.c, aes_key);
297 AES_INC_COUNTER(aes_counter);
298 memset(block, 0, sizeof(block));
299 for (i = 0; i < trailer; i++) {
300 block[i] = to[i] = from[i] ^ EKi.c[i];
301 }
302
303 gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
304 }
305
306 /* Lengths block */
307 lenblock.u[0] = lenblock.u[1] = 0;
308 lenblock.d[1] = htobe32(authdatalen * 8);
309 lenblock.d[3] = htobe32(len * 8);
310 gcm_ghash_v8(Xi.u, Htable, lenblock.c, AES_BLOCK_LEN);
311
312 Xi.u[0] ^= EK0.u[0];
313 Xi.u[1] ^= EK0.u[1];
314 memcpy(tag, Xi.c, GMAC_DIGEST_LEN);
315
316 explicit_bzero(aes_counter, sizeof(aes_counter));
317 explicit_bzero(Xi.c, sizeof(Xi.c));
318 explicit_bzero(EK0.c, sizeof(EK0.c));
319 explicit_bzero(EKi.c, sizeof(EKi.c));
320 explicit_bzero(lenblock.c, sizeof(lenblock.c));
321 }
322
323 int
armv8_aes_decrypt_gcm(AES_key_t * aes_key,size_t len,const uint8_t * from,uint8_t * to,size_t authdatalen,const uint8_t * authdata,const uint8_t tag[static GMAC_DIGEST_LEN],const uint8_t iv[static AES_GCM_IV_LEN],const __uint128_val_t * Htable)324 armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
325 const uint8_t *from, uint8_t *to,
326 size_t authdatalen, const uint8_t *authdata,
327 const uint8_t tag[static GMAC_DIGEST_LEN],
328 const uint8_t iv[static AES_GCM_IV_LEN],
329 const __uint128_val_t *Htable)
330 {
331 size_t i;
332 const uint64_t *from64;
333 uint64_t *to64;
334 uint8_t aes_counter[AES_BLOCK_LEN];
335 uint8_t block[AES_BLOCK_LEN];
336 size_t trailer;
337 __uint128_val_t EK0, EKi, Xi, lenblock;
338 int error;
339
340 error = 0;
341 bzero(&aes_counter, AES_BLOCK_LEN);
342 memcpy(aes_counter, iv, AES_GCM_IV_LEN);
343
344 /* Setup the counter */
345 aes_counter[AES_BLOCK_LEN - 1] = 1;
346
347 /* EK0 for a final GMAC round */
348 aes_v8_encrypt(aes_counter, EK0.c, aes_key);
349
350 memset(Xi.c, 0, sizeof(Xi.c));
351 trailer = authdatalen % AES_BLOCK_LEN;
352 if (authdatalen - trailer > 0) {
353 gcm_ghash_v8(Xi.u, Htable, authdata, authdatalen - trailer);
354 authdata += authdatalen - trailer;
355 }
356 if (trailer > 0 || authdatalen == 0) {
357 memset(block, 0, sizeof(block));
358 memcpy(block, authdata, trailer);
359 gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
360 }
361
362 trailer = len % AES_BLOCK_LEN;
363 if (len - trailer > 0)
364 gcm_ghash_v8(Xi.u, Htable, from, len - trailer);
365 if (trailer > 0) {
366 memset(block, 0, sizeof(block));
367 memcpy(block, from + len - trailer, trailer);
368 gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
369 }
370
371 /* Lengths block */
372 lenblock.u[0] = lenblock.u[1] = 0;
373 lenblock.d[1] = htobe32(authdatalen * 8);
374 lenblock.d[3] = htobe32(len * 8);
375 gcm_ghash_v8(Xi.u, Htable, lenblock.c, AES_BLOCK_LEN);
376
377 Xi.u[0] ^= EK0.u[0];
378 Xi.u[1] ^= EK0.u[1];
379 if (timingsafe_bcmp(tag, Xi.c, GMAC_DIGEST_LEN) != 0) {
380 error = EBADMSG;
381 goto out;
382 }
383
384 /* GCM starts with 2 as counter, 1 is used for final xor of tag. */
385 aes_counter[AES_BLOCK_LEN - 1] = 2;
386
387 from64 = (const uint64_t*)from;
388 to64 = (uint64_t*)to;
389
390 for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
391 aes_v8_encrypt(aes_counter, EKi.c, aes_key);
392 AES_INC_COUNTER(aes_counter);
393 to64[0] = from64[0] ^ EKi.u[0];
394 to64[1] = from64[1] ^ EKi.u[1];
395 to64 += 2;
396 from64 += 2;
397 }
398
399 to += (len - trailer);
400 from += (len - trailer);
401
402 if (trailer) {
403 aes_v8_encrypt(aes_counter, EKi.c, aes_key);
404 AES_INC_COUNTER(aes_counter);
405 for (i = 0; i < trailer; i++)
406 to[i] = from[i] ^ EKi.c[i];
407 }
408
409 out:
410 explicit_bzero(aes_counter, sizeof(aes_counter));
411 explicit_bzero(Xi.c, sizeof(Xi.c));
412 explicit_bzero(EK0.c, sizeof(EK0.c));
413 explicit_bzero(EKi.c, sizeof(EKi.c));
414 explicit_bzero(lenblock.c, sizeof(lenblock.c));
415
416 return (error);
417 }
418