1 /*-
2 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 #include "test.h"
26 __FBSDID("$FreeBSD$");
27
28 #include <locale.h>
29
30 #define __LIBARCHIVE_TEST
31 #include "archive_string.h"
32
33 /*
34 Execute the following to rebuild the data for this program:
35 tail -n +36 test_archive_string_conversion.c | /bin/sh
36 #
37 # This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
38 #
39 if="NormalizationTest.txt"
40 if [ ! -f ${if} ]; then
41 echo "Not found: \"${if}\""
42 exit 0
43 fi
44 of=test_archive_string_conversion.txt.Z
45 echo "\$FreeBSD\$" > ${of}.uu
46 awk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu
47 exit 1
48 */
49
/*
 * Encode the code point `uc` as UTF-8 starting at `p`.
 *
 * Returns the number of bytes stored (1 to 4).  No terminator is written
 * and `uc` is not validated, so the caller must provide room for four
 * bytes.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	int n;

	if (uc > 0xffff) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		p[0] = 0xf0 | ((uc >> 18) & 0x07);
		p[1] = 0x80 | ((uc >> 12) & 0x3f);
		p[2] = 0x80 | ((uc >> 6) & 0x3f);
		p[3] = 0x80 | (uc & 0x3f);
		n = 4;
	} else if (uc > 0x7ff) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		p[0] = 0xe0 | ((uc >> 12) & 0x0f);
		p[1] = 0x80 | ((uc >> 6) & 0x3f);
		p[2] = 0x80 | (uc & 0x3f);
		n = 3;
	} else if (uc > 0x7f) {
		/* 110xxxxx 10xxxxxx */
		p[0] = 0xc0 | ((uc >> 6) & 0x1f);
		p[1] = 0x80 | (uc & 0x3f);
		n = 2;
	} else {
		/* Plain 7-bit value. */
		p[0] = (char)uc;
		n = 1;
	}
	return (n);
}
73
/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *bytes = (unsigned char *)pp;
	const unsigned char hi = (u >> 8) & 0xff;
	const unsigned char lo = u & 0xff;

	bytes[0] = hi;
	bytes[1] = lo;
}
82
/*
 * Encode the code point `uc` as UTF-16BE starting at `p`.
 *
 * Returns the number of bytes stored: 2 for a value that fits one 16-bit
 * unit, 4 when a surrogate pair is needed.  No terminator is written.
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t v;

	if (uc <= 0xffff) {
		archive_be16enc(p, uc);
		return (2);
	}

	/* Too large for a single 16-bit unit: emit high then low surrogate. */
	v = uc - 0x10000;
	archive_be16enc(p, ((v >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (v & 0x3ff) + 0xDC00);
	return (4);
}
100
/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *bytes = (unsigned char *)pp;
	const unsigned char lo = u & 0xff;
	const unsigned char hi = (u >> 8) & 0xff;

	bytes[0] = lo;
	bytes[1] = hi;
}
109
/*
 * Encode the code point `uc` as UTF-16LE starting at `p`.
 *
 * Returns the number of bytes stored: 2 for a value that fits one 16-bit
 * unit, 4 when a surrogate pair is needed.  No terminator is written.
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t v;

	if (uc <= 0xffff) {
		archive_le16enc(p, uc);
		return (2);
	}

	/* Too large for a single 16-bit unit: emit high then low surrogate. */
	v = uc - 0x10000;
	archive_le16enc(p, ((v >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (v & 0x3ff) + 0xDC00);
	return (4);
}
127
/*
 * Report the width of wchar_t in bytes on the build platform.
 */
static int
wc_size(void)
{
	const size_t width = sizeof(wchar_t);

	return ((int)width);
}
133
/*
 * Store the code point `uc` at `wp` as wide characters.
 *
 * When wchar_t is four bytes wide the value is stored as-is.  Otherwise a
 * value above 0xffff is split into a surrogate pair.  Returns the number
 * of wchar_t elements written (1 or 2); no terminator is written.
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	if (wc_size() != 4 && uc > 0xffff) {
		/* Does not fit one narrow wchar_t: high, then low surrogate. */
		const uint32_t v = uc - 0x10000;

		wp[0] = (wchar_t)(((v >> 10) & 0x3ff) + 0xD800);
		wp[1] = (wchar_t)((v & 0x3ff) + 0xDC00);
		return (2);
	}

	*wp = (wchar_t)uc;
	return (1);
}
153
/*
 * Return 1 when `uc` is one of the composed code points that this test
 * expects the Mac OS style NFD conversion to leave unchanged, 0 otherwise.
 *
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 */
static int
mac_nfd_keeps_composed(unsigned uc)
{
	switch (uc) {
	/* Code points inside the documented U+2000 - U+2FFF exclusion. */
	case 0x2194: case 0x219A: case 0x219B:
	case 0x21AE: case 0x21CD: case 0x21CE:
	case 0x21CF: case 0x2204: case 0x2209:
	case 0x220C: case 0x2224: case 0x2226:
	case 0x2241: case 0x2244: case 0x2247:
	case 0x2249: case 0x2260: case 0x2262:
	case 0x226D: case 0x226E: case 0x226F:
	case 0x2270: case 0x2271: case 0x2274:
	case 0x2275: case 0x2276: case 0x2278:
	case 0x2279: case 0x227A: case 0x227B:
	case 0x2280: case 0x2281: case 0x2284:
	case 0x2285: case 0x2288: case 0x2289:
	case 0x22AC: case 0x22AD: case 0x22AE:
	case 0x22AF: case 0x22E0: case 0x22E1:
	case 0x22E2: case 0x22E3: case 0x22EA:
	case 0x22EB: case 0x22EC: case 0x22ED:
	/*
	 * Also left composed on Mac OS, for an undocumented reason:
	 *    NFC        NFD
	 *   1109A ==> 11099 110BA
	 *   1109C ==> 1109B 110BA
	 *   110AB ==> 110A5 110BA
	 */
	case 0x1109A: case 0x1109C: case 0x110AB:
		return (1);
	default:
		return (0);
	}
}

/*
 * Expand a textual code point list into four encodings at once.
 *
 * `pattern` is a NUL-terminated list of upper-case hexadecimal code
 * points; any character that is not 0-9 or A-F ends the current code
 * point.  Each code point is appended to `out` (UTF-8), `wout` (wide
 * characters), `u16be` (UTF-16BE) and `u16le` (UTF-16LE), and every
 * buffer is terminated.  Nothing is bounds-checked, so the caller must
 * size the buffers for the pattern.
 *
 * Returns 1 when `mac_nfd` is non-zero and the first code point of the
 * pattern is one that mac_nfd_keeps_composed() recognizes; otherwise 0.
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	const char *src;
	char *u8 = out;
	wchar_t *wc = wout;
	char *be = u16be;
	char *le = u16le;
	unsigned cp = 0;
	int keep_nfc = 0;

	for (src = pattern; ; src++) {
		/* Accumulate hex digits into the current code point. */
		if (*src >= '0' && *src <= '9') {
			cp = (cp << 4) + (*src - '0');
			continue;
		}
		if (*src >= 'A' && *src <= 'F') {
			cp = (cp << 4) + (*src - 'A' + 0x0a);
			continue;
		}

		/*
		 * A separator or the end of the pattern: the code point is
		 * complete.  Only the first one (nothing emitted to `out`
		 * yet) is checked against the Mac OS exemption list.
		 */
		if (mac_nfd && u8 == out && mac_nfd_keeps_composed(cp))
			keep_nfc = 1;

		be += unicode_to_utf16be(be, cp);
		le += unicode_to_utf16le(le, cp);
		wc += unicode_to_wc(wc, cp);
		u8 += unicode_to_utf8(u8, cp);

		if (*src == '\0')
			break;
		cp = 0;
	}

	/* Terminate: two zero bytes for UTF-16, one element elsewhere. */
	be[0] = 0;
	be[1] = 0;
	le[0] = 0;
	le[1] = 0;
	*wc = L'\0';
	*u8 = '\0';

	return (keep_nfc);
}
235
/*
 * Tell whether this test may treat wchar_t strings as Unicode without
 * relying on the current locale: 1 on native Windows builds (not
 * Cygwin), 0 everywhere else.
 */
static int
is_wc_unicode(void)
{
	int unicode_wchar = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	unicode_wchar = 1;
#endif
	return (unicode_wchar);
}
245
246 /*
247 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
248 * On Mac OS, the characters to be Form D.
249 * On other platforms, the characters to be Form C.
250 */
251 static void
test_archive_string_normalization_nfc(const char * testdata)252 test_archive_string_normalization_nfc(const char *testdata)
253 {
254 struct archive *a, *a2;
255 struct archive_string utf8;
256 struct archive_mstring mstr;
257 struct archive_string_conv *f_sconv8, *t_sconv8;
258 struct archive_string_conv *f_sconv16be, *f_sconv16le;
259 FILE *fp;
260 char buff[512];
261 int line = 0;
262 int locale_is_utf8, wc_is_unicode;
263 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
264
265 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
266 wc_is_unicode = is_wc_unicode();
267 /* If it doesn't exist, just warn and return. */
268 if (!locale_is_utf8 && !wc_is_unicode) {
269 skipping("A test of string normalization for NFC requires "
270 "a suitable locale; en_US.UTF-8 not available on this "
271 "system");
272 return;
273 }
274
275 archive_string_init(&utf8);
276 memset(&mstr, 0, sizeof(mstr));
277
278 /*
279 * Create string conversion objects.
280 */
281 assert((a = archive_read_new()) != NULL);
282 assertA(NULL != (f_sconv8 =
283 archive_string_conversion_from_charset(a, "UTF-8", 0)));
284 assertA(NULL != (f_sconv16be =
285 archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
286 assertA(NULL != (f_sconv16le =
287 archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
288 assert((a2 = archive_write_new()) != NULL);
289 assertA(NULL != (t_sconv8 =
290 archive_string_conversion_to_charset(a2, "UTF-8", 0)));
291 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
292 t_sconv8 == NULL) {
293 /* We cannot continue this test. */
294 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
295 return;
296 }
297 archive_string_conversion_set_opt(f_sconv8, sconv_opt);
298 archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
299 archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
300 archive_string_conversion_set_opt(t_sconv8, sconv_opt);
301
302 /* Open a test pattern file. */
303 assert((fp = fopen(testdata, "r")) != NULL);
304
305 /*
306 * Read test data.
307 * Test data format:
308 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
309 * Unicode pattern format:
310 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
311 */
312 while (fgets(buff, sizeof(buff), fp) != NULL) {
313 char nfc[80], nfd[80];
314 char utf8_nfc[80], utf8_nfd[80];
315 char utf16be_nfc[80], utf16be_nfd[80];
316 char utf16le_nfc[80], utf16le_nfd[80];
317 wchar_t wc_nfc[40], wc_nfd[40];
318 char *e, *p;
319 const wchar_t *wp;
320 const char *mp;
321 size_t mplen;
322
323 line++;
324 if (buff[0] == '#')
325 continue;
326 p = strchr(buff, ';');
327 if (p == NULL)
328 continue;
329 *p++ = '\0';
330 /* Copy an NFC pattern */
331 strncpy(nfc, buff, sizeof(nfc)-1);
332 nfc[sizeof(nfc)-1] = '\0';
333 e = p;
334 p = strchr(p, '\n');
335 if (p == NULL)
336 continue;
337 *p = '\0';
338 /* Copy an NFD pattern */
339 strncpy(nfd, e, sizeof(nfd)-1);
340 nfd[sizeof(nfd)-1] = '\0';
341
342 /*
343 * Get an NFC patterns.
344 */
345 scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
346 nfc, 0);
347
348 /*
349 * Get an NFD patterns.
350 */
351 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
352 nfd, 0);
353
354 if (locale_is_utf8) {
355 /*
356 * Normalize an NFD string for import.
357 */
358 assertEqualInt(0, archive_strcpy_l(
359 &utf8, utf8_nfd, f_sconv8));
360 failure("NFD(%s) should be converted to NFC(%s):%d",
361 nfd, nfc, line);
362 assertEqualUTF8String(utf8_nfc, utf8.s);
363
364 /*
365 * Normalize an NFC string for import.
366 */
367 assertEqualInt(0, archive_strcpy_l(
368 &utf8, utf8_nfc, f_sconv8));
369 failure("NFC(%s) should not be any changed:%d",
370 nfc, line);
371 assertEqualUTF8String(utf8_nfc, utf8.s);
372
373 /*
374 * Copy an NFC string for export.
375 */
376 assertEqualInt(0, archive_strcpy_l(
377 &utf8, utf8_nfc, t_sconv8));
378 failure("NFC(%s) should not be any changed:%d",
379 nfc, line);
380 assertEqualUTF8String(utf8_nfc, utf8.s);
381
382 /*
383 * Normalize an NFD string in UTF-16BE for import.
384 */
385 assertEqualInt(0, archive_strncpy_l(
386 &utf8, utf16be_nfd, 100000, f_sconv16be));
387 failure("NFD(%s) should be converted to NFC(%s):%d",
388 nfd, nfc, line);
389 assertEqualUTF8String(utf8_nfc, utf8.s);
390
391 /*
392 * Normalize an NFD string in UTF-16LE for import.
393 */
394 assertEqualInt(0, archive_strncpy_l(
395 &utf8, utf16le_nfd, 100000, f_sconv16le));
396 failure("NFD(%s) should be converted to NFC(%s):%d",
397 nfd, nfc, line);
398 assertEqualUTF8String(utf8_nfc, utf8.s);
399 }
400
401 /*
402 * Test for archive_mstring interface.
403 * In specific, Windows platform UTF-16BE is directly
404 * converted to/from wide-character to avoid the effect of
405 * current locale since windows platform cannot make
406 * locale UTF-8.
407 */
408 if (locale_is_utf8 || wc_is_unicode) {
409 /*
410 * Normalize an NFD string in UTF-8 for import.
411 */
412 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
413 &mstr, utf8_nfd, 100000, f_sconv8));
414 assertEqualInt(0,
415 archive_mstring_get_wcs(a, &mstr, &wp));
416 failure("UTF-8 NFD(%s) should be converted "
417 "to WCS NFC(%s):%d", nfd, nfc, line);
418 assertEqualWString(wc_nfc, wp);
419
420 /*
421 * Normalize an NFD string in UTF-16BE for import.
422 */
423 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
424 &mstr, utf16be_nfd, 100000, f_sconv16be));
425 assertEqualInt(0,
426 archive_mstring_get_wcs(a, &mstr, &wp));
427 failure("UTF-8 NFD(%s) should be converted "
428 "to WCS NFC(%s):%d", nfd, nfc, line);
429 assertEqualWString(wc_nfc, wp);
430
431 /*
432 * Normalize an NFD string in UTF-16LE for import.
433 */
434 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
435 &mstr, utf16le_nfd, 100000, f_sconv16le));
436 assertEqualInt(0,
437 archive_mstring_get_wcs(a, &mstr, &wp));
438 failure("UTF-8 NFD(%s) should be converted "
439 "to WCS NFC(%s):%d", nfd, nfc, line);
440 assertEqualWString(wc_nfc, wp);
441
442 /*
443 * Copy an NFC wide-string for export.
444 */
445 assertEqualInt(0,
446 archive_mstring_copy_wcs(&mstr, wc_nfc));
447 assertEqualInt(0, archive_mstring_get_mbs_l(
448 a, &mstr, &mp, &mplen, t_sconv8));
449 failure("WCS NFC(%s) should be UTF-8 NFC:%d"
450 ,nfc, line);
451 assertEqualUTF8String(utf8_nfc, mp);
452 }
453 }
454
455 archive_string_free(&utf8);
456 archive_mstring_clean(&mstr);
457 fclose(fp);
458 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
459 assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
460 }
461
462 static void
test_archive_string_normalization_mac_nfd(const char * testdata)463 test_archive_string_normalization_mac_nfd(const char *testdata)
464 {
465 struct archive *a, *a2;
466 struct archive_string utf8;
467 struct archive_mstring mstr;
468 struct archive_string_conv *f_sconv8, *t_sconv8;
469 struct archive_string_conv *f_sconv16be, *f_sconv16le;
470 FILE *fp;
471 char buff[512];
472 int line = 0;
473 int locale_is_utf8, wc_is_unicode;
474 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
475
476 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
477 wc_is_unicode = is_wc_unicode();
478 /* If it doesn't exist, just warn and return. */
479 if (!locale_is_utf8 && !wc_is_unicode) {
480 skipping("A test of string normalization for NFD requires "
481 "a suitable locale; en_US.UTF-8 not available on this "
482 "system");
483 return;
484 }
485
486 archive_string_init(&utf8);
487 memset(&mstr, 0, sizeof(mstr));
488
489 /*
490 * Create string conversion objects.
491 */
492 assert((a = archive_read_new()) != NULL);
493 assertA(NULL != (f_sconv8 =
494 archive_string_conversion_from_charset(a, "UTF-8", 0)));
495 assertA(NULL != (f_sconv16be =
496 archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
497 assertA(NULL != (f_sconv16le =
498 archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
499 assert((a2 = archive_write_new()) != NULL);
500 assertA(NULL != (t_sconv8 =
501 archive_string_conversion_to_charset(a2, "UTF-8", 0)));
502 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
503 t_sconv8 == NULL) {
504 /* We cannot continue this test. */
505 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
506 return;
507 }
508 archive_string_conversion_set_opt(f_sconv8, sconv_opt);
509 archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
510 archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
511 archive_string_conversion_set_opt(t_sconv8, sconv_opt);
512
513 /* Open a test pattern file. */
514 assert((fp = fopen(testdata, "r")) != NULL);
515
516 /*
517 * Read test data.
518 * Test data format:
519 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
520 * Unicode pattern format:
521 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
522 */
523 while (fgets(buff, sizeof(buff), fp) != NULL) {
524 char nfc[80], nfd[80];
525 char utf8_nfc[80], utf8_nfd[80];
526 char utf16be_nfc[80], utf16be_nfd[80];
527 char utf16le_nfc[80], utf16le_nfd[80];
528 wchar_t wc_nfc[40], wc_nfd[40];
529 char *e, *p;
530 const wchar_t *wp;
531 const char *mp;
532 size_t mplen;
533 int should_be_nfc;
534
535 line++;
536 if (buff[0] == '#')
537 continue;
538 p = strchr(buff, ';');
539 if (p == NULL)
540 continue;
541 *p++ = '\0';
542 /* Copy an NFC pattern */
543 strncpy(nfc, buff, sizeof(nfc)-1);
544 nfc[sizeof(nfc)-1] = '\0';
545 e = p;
546 p = strchr(p, '\n');
547 if (p == NULL)
548 continue;
549 *p = '\0';
550 /* Copy an NFD pattern */
551 strncpy(nfd, e, sizeof(nfd)-1);
552 nfd[sizeof(nfd)-1] = '\0';
553
554 /*
555 * Get an NFC patterns.
556 */
557 should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
558 utf16be_nfc, utf16le_nfc, nfc, 1);
559
560 /*
561 * Get an NFD patterns.
562 */
563 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
564 nfd, 0);
565
566 if (locale_is_utf8) {
567 /*
568 * Normalize an NFC string for import.
569 */
570 assertEqualInt(0, archive_strcpy_l(
571 &utf8, utf8_nfc, f_sconv8));
572 if (should_be_nfc) {
573 failure("NFC(%s) should not be converted to"
574 " NFD(%s):%d", nfc, nfd, line);
575 assertEqualUTF8String(utf8_nfc, utf8.s);
576 } else {
577 failure("NFC(%s) should be converted to"
578 " NFD(%s):%d", nfc, nfd, line);
579 assertEqualUTF8String(utf8_nfd, utf8.s);
580 }
581
582 /*
583 * Normalize an NFD string for import.
584 */
585 assertEqualInt(0, archive_strcpy_l(
586 &utf8, utf8_nfd, f_sconv8));
587 failure("NFD(%s) should not be any changed:%d",
588 nfd, line);
589 assertEqualUTF8String(utf8_nfd, utf8.s);
590
591 /*
592 * Copy an NFD string for export.
593 */
594 assertEqualInt(0, archive_strcpy_l(
595 &utf8, utf8_nfd, t_sconv8));
596 failure("NFD(%s) should not be any changed:%d",
597 nfd, line);
598 assertEqualUTF8String(utf8_nfd, utf8.s);
599
600 /*
601 * Normalize an NFC string in UTF-16BE for import.
602 */
603 assertEqualInt(0, archive_strncpy_l(
604 &utf8, utf16be_nfc, 100000, f_sconv16be));
605 if (should_be_nfc) {
606 failure("NFC(%s) should not be converted to"
607 " NFD(%s):%d", nfc, nfd, line);
608 assertEqualUTF8String(utf8_nfc, utf8.s);
609 } else {
610 failure("NFC(%s) should be converted to"
611 " NFD(%s):%d", nfc, nfd, line);
612 assertEqualUTF8String(utf8_nfd, utf8.s);
613 }
614
615 /*
616 * Normalize an NFC string in UTF-16LE for import.
617 */
618 assertEqualInt(0, archive_strncpy_l(
619 &utf8, utf16le_nfc, 100000, f_sconv16le));
620 if (should_be_nfc) {
621 failure("NFC(%s) should not be converted to"
622 " NFD(%s):%d", nfc, nfd, line);
623 assertEqualUTF8String(utf8_nfc, utf8.s);
624 } else {
625 failure("NFC(%s) should be converted to"
626 " NFD(%s):%d", nfc, nfd, line);
627 assertEqualUTF8String(utf8_nfd, utf8.s);
628 }
629 }
630
631 /*
632 * Test for archive_mstring interface.
633 * In specific, Windows platform UTF-16BE is directly
634 * converted to/from wide-character to avoid the effect of
635 * current locale since windows platform cannot make
636 * locale UTF-8.
637 */
638 if (locale_is_utf8 || wc_is_unicode) {
639 /*
640 * Normalize an NFD string in UTF-8 for import.
641 */
642 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
643 &mstr, utf8_nfc, 100000, f_sconv8));
644 assertEqualInt(0,
645 archive_mstring_get_wcs(a, &mstr, &wp));
646 if (should_be_nfc) {
647 failure("UTF-8 NFC(%s) should not be converted "
648 "to WCS NFD(%s):%d", nfc, nfd, line);
649 assertEqualWString(wc_nfc, wp);
650 } else {
651 failure("UTF-8 NFC(%s) should be converted "
652 "to WCS NFD(%s):%d", nfc, nfd, line);
653 assertEqualWString(wc_nfd, wp);
654 }
655
656 /*
657 * Normalize an NFD string in UTF-16BE for import.
658 */
659 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
660 &mstr, utf16be_nfc, 100000, f_sconv16be));
661 assertEqualInt(0,
662 archive_mstring_get_wcs(a, &mstr, &wp));
663 if (should_be_nfc) {
664 failure("UTF-16BE NFC(%s) should not be "
665 "converted to WCS NFD(%s):%d",
666 nfc, nfd, line);
667 assertEqualWString(wc_nfc, wp);
668 } else {
669 failure("UTF-16BE NFC(%s) should be converted "
670 "to WCS NFD(%s):%d", nfc, nfd, line);
671 assertEqualWString(wc_nfd, wp);
672 }
673
674 /*
675 * Normalize an NFD string in UTF-16LE for import.
676 */
677 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
678 &mstr, utf16le_nfc, 100000, f_sconv16le));
679 assertEqualInt(0,
680 archive_mstring_get_wcs(a, &mstr, &wp));
681 if (should_be_nfc) {
682 failure("UTF-16LE NFC(%s) should not be "
683 "converted to WCS NFD(%s):%d",
684 nfc, nfd, line);
685 assertEqualWString(wc_nfc, wp);
686 } else {
687 failure("UTF-16LE NFC(%s) should be converted "
688 "to WCS NFD(%s):%d", nfc, nfd, line);
689 assertEqualWString(wc_nfd, wp);
690 }
691
692 /*
693 * Copy an NFD wide-string for export.
694 */
695 assertEqualInt(0, archive_mstring_copy_wcs(
696 &mstr, wc_nfd));
697 assertEqualInt(0, archive_mstring_get_mbs_l(
698 a, &mstr, &mp, &mplen, t_sconv8));
699 failure("WCS NFD(%s) should be UTF-8 NFD:%d"
700 ,nfd, line);
701 assertEqualUTF8String(utf8_nfd, mp);
702 }
703 }
704
705 archive_string_free(&utf8);
706 archive_mstring_clean(&mstr);
707 fclose(fp);
708 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
709 assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
710 }
711
712 static void
test_archive_string_canonicalization(void)713 test_archive_string_canonicalization(void)
714 {
715 struct archive *a;
716 struct archive_string_conv *sconv;
717
718 setlocale(LC_ALL, "en_US.UTF-8");
719
720 assert((a = archive_read_new()) != NULL);
721
722 assertA(NULL != (sconv =
723 archive_string_conversion_to_charset(a, "UTF-8", 1)));
724 failure("Charset name should be UTF-8");
725 assertEqualString("UTF-8",
726 archive_string_conversion_charset_name(sconv));
727
728 assertA(NULL != (sconv =
729 archive_string_conversion_to_charset(a, "UTF8", 1)));
730 failure("Charset name should be UTF-8");
731 assertEqualString("UTF-8",
732 archive_string_conversion_charset_name(sconv));
733
734 assertA(NULL != (sconv =
735 archive_string_conversion_to_charset(a, "utf8", 1)));
736 failure("Charset name should be UTF-8");
737 assertEqualString("UTF-8",
738 archive_string_conversion_charset_name(sconv));
739
740 assertA(NULL != (sconv =
741 archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
742 failure("Charset name should be UTF-16BE");
743 assertEqualString("UTF-16BE",
744 archive_string_conversion_charset_name(sconv));
745
746 assertA(NULL != (sconv =
747 archive_string_conversion_to_charset(a, "UTF16BE", 1)));
748 failure("Charset name should be UTF-16BE");
749 assertEqualString("UTF-16BE",
750 archive_string_conversion_charset_name(sconv));
751
752 assertA(NULL != (sconv =
753 archive_string_conversion_to_charset(a, "utf16be", 1)));
754 failure("Charset name should be UTF-16BE");
755 assertEqualString("UTF-16BE",
756 archive_string_conversion_charset_name(sconv));
757
758 assertA(NULL != (sconv =
759 archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
760 failure("Charset name should be UTF-16LE");
761 assertEqualString("UTF-16LE",
762 archive_string_conversion_charset_name(sconv));
763
764 assertA(NULL != (sconv =
765 archive_string_conversion_to_charset(a, "UTF16LE", 1)));
766 failure("Charset name should be UTF-16LE");
767 assertEqualString("UTF-16LE",
768 archive_string_conversion_charset_name(sconv));
769
770 assertA(NULL != (sconv =
771 archive_string_conversion_to_charset(a, "utf16le", 1)));
772 failure("Charset name should be UTF-16LE");
773 assertEqualString("UTF-16LE",
774 archive_string_conversion_charset_name(sconv));
775
776 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
777
778 }
779
780 static void
check_string(struct archive * a,struct archive_mstring * mstr,struct archive_string_conv * sc,const char * exp,const wchar_t * wexp)781 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
782 const char *exp, const wchar_t *wexp)
783 {
784 /* Do all the tests on a copy so that we can have a clear initial state every time */
785 struct archive_mstring mstr2;
786 const char *p = NULL;
787 const wchar_t *wp = NULL;
788 size_t len = 0;
789
790 memset(&mstr2, 0, sizeof(mstr2));
791
792 archive_mstring_copy(&mstr2, mstr);
793 assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
794 assertEqualString(exp, p);
795 p = NULL;
796
797 archive_mstring_copy(&mstr2, mstr);
798 assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
799 assertEqualString(exp, p);
800 p = NULL;
801
802 archive_mstring_copy(&mstr2, mstr);
803 assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
804 assertEqualWString(wexp, wp);
805 wp = NULL;
806
807 archive_mstring_copy(&mstr2, mstr);
808 assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
809 assertEqualString(exp, p);
810 assertEqualInt(len, strlen(exp));
811 p = NULL;
812 len = 0;
813
814 archive_mstring_clean(&mstr2);
815 }
816
817 /*
818 * Make sure no matter what the input encoding is, the string can be
819 * converted too all the output encodings.
820 */
821 static void
test_archive_string_set_get(void)822 test_archive_string_set_get(void)
823 {
824 struct archive *a;
825 struct archive_mstring mstr;
826 struct archive_string_conv *sc;
827
828 setlocale(LC_ALL, "en_US.UTF-8");
829
830 assert((a = archive_read_new()) != NULL);
831 memset(&mstr, 0, sizeof(mstr));
832
833 assertA(NULL != (sc =
834 archive_string_conversion_to_charset(a, "UTF-8", 1)));
835 failure("Charset name should be UTF-8");
836 assertEqualString("UTF-8",
837 archive_string_conversion_charset_name(sc));
838
839 assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
840 check_string(a, &mstr, sc, "AAA", L"AAA");
841 assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
842 check_string(a, &mstr, sc, "BBBB", L"BBBB");
843 assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
844 check_string(a, &mstr, sc, "CCC12", L"CCC12");
845 assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
846 check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
847 assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
848 check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
849
850 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
851
852 }
853
DEFINE_TEST(test_archive_string_conversion)854 DEFINE_TEST(test_archive_string_conversion)
855 {
856 static const char reffile[] = "test_archive_string_conversion.txt.Z";
857 static const char testdata[] = "testdata.txt";
858 struct archive *a;
859 struct archive_entry *ae;
860 char buff[512];
861 ssize_t size;
862 FILE *fp;
863
864 /*
865 * Extract a test pattern file.
866 */
867 extract_reference_file(reffile);
868 assert((a = archive_read_new()) != NULL);
869 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
870 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
871 assertEqualIntA(a, ARCHIVE_OK,
872 archive_read_open_filename(a, reffile, 512));
873
874 assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
875 assert((fp = fopen(testdata, "w")) != NULL);
876 while ((size = archive_read_data(a, buff, 512)) > 0)
877 assertEqualInt(size, fwrite(buff, 1, size, fp));
878 assertEqualInt(0, fclose(fp));
879 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
880
881 test_archive_string_normalization_nfc(testdata);
882 test_archive_string_normalization_mac_nfd(testdata);
883 test_archive_string_canonicalization();
884 test_archive_string_set_get();
885 }
886