xref: /vim-8.2.3635/src/os_mac_conv.c (revision cf2d8dee)
1 /* vi:set ts=8 sts=4 sw=4:
2  *
3  * VIM - Vi IMproved	by Bram Moolenaar
4  *
5  * Do ":help uganda"  in Vim to read copying and usage conditions.
6  * Do ":help credits" in Vim to see a list of people who contributed.
7  * See README.txt for an overview of the Vim source code.
8  */
9 /*
10  * os_mac_conv.c: Code specifically for Mac string conversions.
11  *
12  * This code has been put in a separate file to avoid the conflicts that are
13  * caused by including both the X11 and Carbon header files.
14  */
15 
16 #define NO_X11_INCLUDES
17 #define BalloonEval int   /* used in header files */
18 
19 #include "vim.h"
20 
21 #if !defined(FEAT_GUI_MAC) && !defined(PROTO)
22 # include <CoreServices/CoreServices.h>
23 #endif
24 
25 
26 #if defined(MACOS_CONVERT) || defined(PROTO)
27 
28 # ifdef PROTO
29 /* A few dummy types to be able to generate function prototypes. */
30 typedef int UniChar;
31 typedef int *TECObjectRef;
32 typedef int CFStringRef;
33 # endif
34 
35 static char_u	    *mac_utf16_to_utf8(UniChar *from, size_t fromLen, size_t *actualLen);
36 static UniChar	    *mac_utf8_to_utf16(char_u *from, size_t fromLen, size_t *actualLen);
37 
38 /* Converter for composing decomposed HFS+ file paths */
39 static TECObjectRef gPathConverter;
40 /* Converter used by mac_utf16_to_utf8 */
41 static TECObjectRef gUTF16ToUTF8Converter;
42 
43 /*
44  * A Mac version of string_convert_ext() for special cases.
45  */
46     char_u *
47 mac_string_convert(
48     char_u		*ptr,
49     int			len,
50     int			*lenp,
51     int			fail_on_error,
52     int			from_enc,
53     int			to_enc,
54     int			*unconvlenp)
55 {
56     char_u		*retval, *d;
57     CFStringRef		cfstr;
58     int			buflen, in, out, l, i;
59     CFStringEncoding	from;
60     CFStringEncoding	to;
61 
62     switch (from_enc)
63     {
64 	case 'l':   from = kCFStringEncodingISOLatin1; break;
65 	case 'm':   from = kCFStringEncodingMacRoman; break;
66 	case 'u':   from = kCFStringEncodingUTF8; break;
67 	default:    return NULL;
68     }
69     switch (to_enc)
70     {
71 	case 'l':   to = kCFStringEncodingISOLatin1; break;
72 	case 'm':   to = kCFStringEncodingMacRoman; break;
73 	case 'u':   to = kCFStringEncodingUTF8; break;
74 	default:    return NULL;
75     }
76 
77     if (unconvlenp != NULL)
78 	*unconvlenp = 0;
79     cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
80 
81     if (cfstr == NULL)
82 	fprintf(stderr, "Encoding failed\n");
83     /* When conversion failed, try excluding bytes from the end, helps when
84      * there is an incomplete byte sequence.  Only do up to 6 bytes to avoid
85      * looping a long time when there really is something unconvertible. */
86     while (cfstr == NULL && unconvlenp != NULL && len > 1 && *unconvlenp < 6)
87     {
88 	--len;
89 	++*unconvlenp;
90 	cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
91     }
92     if (cfstr == NULL)
93 	return NULL;
94 
95     if (to == kCFStringEncodingUTF8)
96 	buflen = len * 6 + 1;
97     else
98 	buflen = len + 1;
99     retval = alloc(buflen);
100     if (retval == NULL)
101     {
102 	CFRelease(cfstr);
103 	return NULL;
104     }
105 
106 #if 0
107     CFRange convertRange = CFRangeMake(0, CFStringGetLength(cfstr));
108     /*  Determine output buffer size */
109     CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, NULL, 0, (CFIndex *)&buflen);
110     retval = (buflen > 0) ? alloc(buflen) : NULL;
111     if (retval == NULL) {
112 	CFRelease(cfstr);
113 	return NULL;
114     }
115 
116     if (lenp)
117 	*lenp = buflen / sizeof(char_u);
118 
119     if (!CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, retval, buflen, NULL))
120 #endif
121     if (!CFStringGetCString(cfstr, (char *)retval, buflen, to))
122     {
123 	CFRelease(cfstr);
124 	if (fail_on_error)
125 	{
126 	    vim_free(retval);
127 	    return NULL;
128 	}
129 
130 	fprintf(stderr, "Trying char-by-char conversion...\n");
131 	/* conversion failed for the whole string, but maybe it will work
132 	 * for each character */
133 	for (d = retval, in = 0, out = 0; in < len && out < buflen - 1;)
134 	{
135 	    if (from == kCFStringEncodingUTF8)
136 		l = utf_ptr2len(ptr + in);
137 	    else
138 		l = 1;
139 	    cfstr = CFStringCreateWithBytes(NULL, ptr + in, l, from, 0);
140 	    if (cfstr == NULL)
141 	    {
142 		*d++ = '?';
143 		out++;
144 	    }
145 	    else
146 	    {
147 		if (!CFStringGetCString(cfstr, (char *)d, buflen - out, to))
148 		{
149 		    *d++ = '?';
150 		    out++;
151 		}
152 		else
153 		{
154 		    i = STRLEN(d);
155 		    d += i;
156 		    out += i;
157 		}
158 		CFRelease(cfstr);
159 	    }
160 	    in += l;
161 	}
162 	*d = NUL;
163 	if (lenp != NULL)
164 	    *lenp = out;
165 	return retval;
166     }
167     CFRelease(cfstr);
168     if (lenp != NULL)
169 	*lenp = STRLEN(retval);
170 
171     return retval;
172 }
173 
174 /*
175  * Conversion from Apple MacRoman char encoding to UTF-8 or latin1, using
176  * standard Carbon framework.
177  * Input: "ptr[*sizep]".
178  * "real_size" is the size of the buffer that "ptr" points to.
179  * output is in-place, "sizep" is adjusted.
180  * Returns OK or FAIL.
181  */
182     int
183 macroman2enc(
184     char_u	*ptr,
185     long	*sizep,
186     long	real_size)
187 {
188     CFStringRef		cfstr;
189     CFRange		r;
190     CFIndex		len = *sizep;
191 
192     /* MacRoman is an 8-bit encoding, no need to move bytes to
193      * conv_rest[]. */
194     cfstr = CFStringCreateWithBytes(NULL, ptr, len,
195 						kCFStringEncodingMacRoman, 0);
196     /*
197      * If there is a conversion error, try using another
198      * conversion.
199      */
200     if (cfstr == NULL)
201 	return FAIL;
202 
203     r.location = 0;
204     r.length = CFStringGetLength(cfstr);
205     if (r.length != CFStringGetBytes(cfstr, r,
206 	    (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
207 	    0, /* no lossy conversion */
208 	    0, /* not external representation */
209 	    ptr + *sizep, real_size - *sizep, &len))
210     {
211 	CFRelease(cfstr);
212 	return FAIL;
213     }
214     CFRelease(cfstr);
215     mch_memmove(ptr, ptr + *sizep, len);
216     *sizep = len;
217 
218     return OK;
219 }
220 
221 /*
222  * Conversion from UTF-8 or latin1 to MacRoman.
223  * Input: "from[fromlen]"
224  * Output: "to[maxtolen]" length in "*tolenp"
225  * Unconverted rest in rest[*restlenp].
226  * Returns OK or FAIL.
227  */
228     int
229 enc2macroman(
230     char_u	*from,
231     size_t	fromlen,
232     char_u	*to,
233     int		*tolenp,
234     int		maxtolen,
235     char_u	*rest,
236     int		*restlenp)
237 {
238     CFStringRef	cfstr;
239     CFRange	r;
240     CFIndex	l;
241 
242     *restlenp = 0;
243     cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
244 	    (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
245 	    0);
246     while (cfstr == NULL && *restlenp < 3 && fromlen > 1)
247     {
248 	rest[*restlenp++] = from[--fromlen];
249 	cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
250 		(enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
251 		0);
252     }
253     if (cfstr == NULL)
254 	return FAIL;
255 
256     r.location = 0;
257     r.length = CFStringGetLength(cfstr);
258     if (r.length != CFStringGetBytes(cfstr, r,
259 		kCFStringEncodingMacRoman,
260 		0, /* no lossy conversion */
261 		0, /* not external representation (since vim
262 		    * handles this internally */
263 		to, maxtolen, &l))
264     {
265 	CFRelease(cfstr);
266 	return FAIL;
267     }
268     CFRelease(cfstr);
269     *tolenp = l;
270     return OK;
271 }
272 
273 /*
274  * Initializes text converters
275  */
276     void
277 mac_conv_init(void)
278 {
279     TextEncoding    utf8_encoding;
280     TextEncoding    utf8_hfsplus_encoding;
281     TextEncoding    utf8_canon_encoding;
282     TextEncoding    utf16_encoding;
283 
284     utf8_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
285 	    kTextEncodingDefaultVariant, kUnicodeUTF8Format);
286     utf8_hfsplus_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
287 	    kUnicodeHFSPlusCompVariant, kUnicodeUTF8Format);
288     utf8_canon_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
289 	    kUnicodeCanonicalCompVariant, kUnicodeUTF8Format);
290     utf16_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
291 	    kTextEncodingDefaultVariant, kUnicode16BitFormat);
292 
293     if (TECCreateConverter(&gPathConverter, utf8_encoding,
294 		utf8_hfsplus_encoding) != noErr)
295 	gPathConverter = NULL;
296 
297     if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
298 		utf8_canon_encoding) != noErr)
299     {
300 	/* On pre-10.3, Unicode normalization is not available so
301 	 * fall back to non-normalizing converter */
302 	if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
303 		    utf8_encoding) != noErr)
304 	    gUTF16ToUTF8Converter = NULL;
305     }
306 }
307 
308 /*
309  * Destroys text converters
310  */
311     void
312 mac_conv_cleanup(void)
313 {
314     if (gUTF16ToUTF8Converter)
315     {
316 	TECDisposeConverter(gUTF16ToUTF8Converter);
317 	gUTF16ToUTF8Converter = NULL;
318     }
319 
320     if (gPathConverter)
321     {
322 	TECDisposeConverter(gPathConverter);
323 	gPathConverter = NULL;
324     }
325 }
326 
327 /*
328  * Conversion from UTF-16 UniChars to 'encoding'
329  * The function signature uses the real type of UniChar (as typedef'ed in
330  * CFBase.h) to avoid clashes with X11 header files in the .pro file
331  */
332     char_u *
333 mac_utf16_to_enc(
334     unsigned short *from,
335     size_t fromLen,
336     size_t *actualLen)
337 {
338     /* Following code borrows somewhat from os_mswin.c */
339     vimconv_T	conv;
340     size_t      utf8_len;
341     char_u      *utf8_str;
342     char_u      *result = NULL;
343 
344     /* Convert to utf-8 first, works better with iconv */
345     utf8_len = 0;
346     utf8_str = mac_utf16_to_utf8(from, fromLen, &utf8_len);
347 
348     if (utf8_str)
349     {
350 	/* We might be called before we have p_enc set up. */
351 	conv.vc_type = CONV_NONE;
352 
353 	/* If encoding (p_enc) is any unicode, it is actually in utf-8 (vim
354 	 * internal unicode is always utf-8) so don't convert in such cases */
355 
356 	if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0)
357 	    convert_setup(&conv, (char_u *)"utf-8",
358 		    p_enc? p_enc: (char_u *)"macroman");
359 	if (conv.vc_type == CONV_NONE)
360 	{
361 	    /* p_enc is utf-8, so we're done. */
362 	    result = utf8_str;
363 	}
364 	else
365 	{
366 	    result = string_convert(&conv, utf8_str, (int *)&utf8_len);
367 	    vim_free(utf8_str);
368 	}
369 
370 	convert_setup(&conv, NULL, NULL);
371 
372 	if (actualLen)
373 	    *actualLen = utf8_len;
374     }
375     else if (actualLen)
376 	*actualLen = 0;
377 
378     return result;
379 }
380 
381 /*
382  * Conversion from 'encoding' to UTF-16 UniChars
383  * The function return uses the real type of UniChar (as typedef'ed in
384  * CFBase.h) to avoid clashes with X11 header files in the .pro file
385  */
386     unsigned short *
387 mac_enc_to_utf16(
388     char_u *from,
389     size_t fromLen,
390     size_t *actualLen)
391 {
392     /* Following code borrows somewhat from os_mswin.c */
393     vimconv_T	conv;
394     size_t      utf8_len;
395     char_u      *utf8_str;
396     UniChar     *result = NULL;
397     Boolean     should_free_utf8 = FALSE;
398 
399     do
400     {
401 	/* Use MacRoman by default, we might be called before we have p_enc
402 	 * set up.  Convert to utf-8 first, works better with iconv().  Does
403 	 * nothing if 'encoding' is "utf-8". */
404 	conv.vc_type = CONV_NONE;
405 	if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0 &&
406 		convert_setup(&conv, p_enc ? p_enc : (char_u *)"macroman",
407 		    (char_u *)"utf-8") == FAIL)
408 	    break;
409 
410 	if (conv.vc_type != CONV_NONE)
411 	{
412 	    utf8_len = fromLen;
413 	    utf8_str = string_convert(&conv, from, (int *)&utf8_len);
414 	    should_free_utf8 = TRUE;
415 	}
416 	else
417 	{
418 	    utf8_str = from;
419 	    utf8_len = fromLen;
420 	}
421 
422 	if (utf8_str == NULL)
423 	    break;
424 
425 	convert_setup(&conv, NULL, NULL);
426 
427 	result = mac_utf8_to_utf16(utf8_str, utf8_len, actualLen);
428 
429 	if (should_free_utf8)
430 	    vim_free(utf8_str);
431 	return result;
432     }
433     while (0);
434 
435     if (actualLen)
436 	*actualLen = 0;
437 
438     return result;
439 }
440 
441 /*
442  * Converts from UTF-16 UniChars to CFString
443  * The void * return type is actually a CFStringRef
444  */
445     void *
446 mac_enc_to_cfstring(
447     char_u  *from,
448     size_t  fromLen)
449 {
450     UniChar	*utf16_str;
451     size_t	utf16_len;
452     CFStringRef	result = NULL;
453 
454     utf16_str = mac_enc_to_utf16(from, fromLen, &utf16_len);
455     if (utf16_str)
456     {
457 	result = CFStringCreateWithCharacters(NULL, utf16_str, utf16_len/sizeof(UniChar));
458 	vim_free(utf16_str);
459     }
460 
461     return (void *)result;
462 }
463 
464 /*
465  * Converts a decomposed HFS+ UTF-8 path to precomposed UTF-8
466  */
467     char_u *
468 mac_precompose_path(
469     char_u  *decompPath,
470     size_t  decompLen,
471     size_t  *precompLen)
472 {
473     char_u  *result = NULL;
474     size_t  actualLen = 0;
475 
476     if (gPathConverter)
477     {
478 	result = alloc(decompLen);
479 	if (result)
480 	{
481 	    if (TECConvertText(gPathConverter, decompPath,
482 			decompLen, &decompLen, result,
483 			decompLen, &actualLen) != noErr)
484 	    {
485 		vim_free(result);
486 		result = NULL;
487 	    }
488 	}
489     }
490 
491     if (precompLen)
492 	*precompLen = actualLen;
493 
494     return result;
495 }
496 
497 /*
498  * Converts from UTF-16 UniChars to precomposed UTF-8
499  */
500     static char_u *
501 mac_utf16_to_utf8(
502     UniChar *from,
503     size_t fromLen,
504     size_t *actualLen)
505 {
506     ByteCount		utf8_len;
507     ByteCount		inputRead;
508     char_u		*result;
509 
510     if (gUTF16ToUTF8Converter)
511     {
512 	result = alloc(fromLen * 6 + 1);
513 	if (result && TECConvertText(gUTF16ToUTF8Converter, (ConstTextPtr)from,
514 		    fromLen, &inputRead, result,
515 		    (fromLen*6+1)*sizeof(char_u), &utf8_len) == noErr)
516 	{
517 	    TECFlushText(gUTF16ToUTF8Converter, result, (fromLen*6+1)*sizeof(char_u), &inputRead);
518 	    utf8_len += inputRead;
519 	}
520 	else
521 	{
522 	    vim_free(result);
523 	    result = NULL;
524 	}
525     }
526     else
527     {
528 	result = NULL;
529     }
530 
531     if (actualLen)
532 	*actualLen = result ? utf8_len : 0;
533 
534     return result;
535 }
536 
537 /*
538  * Converts from UTF-8 to UTF-16 UniChars
539  */
540     static UniChar *
541 mac_utf8_to_utf16(
542     char_u *from,
543     size_t fromLen,
544     size_t *actualLen)
545 {
546     CFStringRef  utf8_str;
547     CFRange      convertRange;
548     UniChar      *result = NULL;
549 
550     utf8_str = CFStringCreateWithBytes(NULL, from, fromLen,
551 	    kCFStringEncodingUTF8, FALSE);
552 
553     if (utf8_str == NULL) {
554 	if (actualLen)
555 	    *actualLen = 0;
556 	return NULL;
557     }
558 
559     convertRange = CFRangeMake(0, CFStringGetLength(utf8_str));
560     result = (UniChar *)alloc(convertRange.length * sizeof(UniChar));
561 
562     CFStringGetCharacters(utf8_str, convertRange, result);
563 
564     CFRelease(utf8_str);
565 
566     if (actualLen)
567 	*actualLen = convertRange.length * sizeof(UniChar);
568 
569     return result;
570 }
571 
572 /*
573  * Sets LANG environment variable in Vim from Mac locale
574  */
575     void
576 mac_lang_init(void)
577 {
578     if (mch_getenv((char_u *)"LANG") == NULL)
579     {
580 	char	buf[20];
581 	if (LocaleRefGetPartString(NULL,
582 		    kLocaleLanguageMask | kLocaleLanguageVariantMask |
583 		    kLocaleRegionMask | kLocaleRegionVariantMask,
584 		    sizeof buf, buf) == noErr && *buf)
585 	{
586 	    vim_setenv((char_u *)"LANG", (char_u *)buf);
587 #   ifdef HAVE_LOCALE_H
588 	    setlocale(LC_ALL, "");
589 #   endif
590 	}
591     }
592 }
593 #endif /* MACOS_CONVERT */
594