xref: /vim-8.2.3635/src/os_mac_conv.c (revision 2bf24176)
1 /* vi:set ts=8 sts=4 sw=4:
2  *
3  * VIM - Vi IMproved	by Bram Moolenaar
4  *
5  * Do ":help uganda"  in Vim to read copying and usage conditions.
6  * Do ":help credits" in Vim to see a list of people who contributed.
7  * See README.txt for an overview of the Vim source code.
8  */
9 /*
10  * os_mac_conv.c: Code specifically for Mac string conversions.
11  *
12  * This code has been put in a separate file to avoid the conflicts that are
13  * caused by including both the X11 and Carbon header files.
14  */
15 
16 #define NO_X11_INCLUDES
17 #define BalloonEval int   /* used in header files */
18 
19 #include "vim.h"
20 #ifndef FEAT_GUI_MAC
21 # include <CoreServices/CoreServices.h>
22 #endif
23 
24 
25 #if defined(MACOS_CONVERT) || defined(PROTO)
26 
27 # ifdef PROTO
28 /* A few dummy types to be able to generate function prototypes. */
29 typedef int UniChar;
30 typedef int *TECObjectRef;
31 typedef int CFStringRef;
32 # endif
33 
34 static char_u	    *mac_utf16_to_utf8 __ARGS((UniChar *from, size_t fromLen, size_t *actualLen));
35 static UniChar	    *mac_utf8_to_utf16 __ARGS((char_u *from, size_t fromLen, size_t *actualLen));
36 
37 /* Converter for composing decomposed HFS+ file paths */
38 static TECObjectRef gPathConverter;
39 /* Converter used by mac_utf16_to_utf8 */
40 static TECObjectRef gUTF16ToUTF8Converter;
41 
42 /*
43  * A Mac version of string_convert_ext() for special cases.
44  */
45     char_u *
46 mac_string_convert(ptr, len, lenp, fail_on_error, from_enc, to_enc, unconvlenp)
47     char_u		*ptr;
48     int			len;
49     int			*lenp;
50     int			fail_on_error;
51     int			from_enc;
52     int			to_enc;
53     int			*unconvlenp;
54 {
55     char_u		*retval, *d;
56     CFStringRef		cfstr;
57     int			buflen, in, out, l, i;
58     CFStringEncoding	from;
59     CFStringEncoding	to;
60 
61     switch (from_enc)
62     {
63 	case 'l':   from = kCFStringEncodingISOLatin1; break;
64 	case 'm':   from = kCFStringEncodingMacRoman; break;
65 	case 'u':   from = kCFStringEncodingUTF8; break;
66 	default:    return NULL;
67     }
68     switch (to_enc)
69     {
70 	case 'l':   to = kCFStringEncodingISOLatin1; break;
71 	case 'm':   to = kCFStringEncodingMacRoman; break;
72 	case 'u':   to = kCFStringEncodingUTF8; break;
73 	default:    return NULL;
74     }
75 
76     if (unconvlenp != NULL)
77 	*unconvlenp = 0;
78     cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
79 
80     if (cfstr == NULL)
81 	fprintf(stderr, "Encoding failed\n");
82     /* When conversion failed, try excluding bytes from the end, helps when
83      * there is an incomplete byte sequence.  Only do up to 6 bytes to avoid
84      * looping a long time when there really is something unconvertible. */
85     while (cfstr == NULL && unconvlenp != NULL && len > 1 && *unconvlenp < 6)
86     {
87 	--len;
88 	++*unconvlenp;
89 	cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
90     }
91     if (cfstr == NULL)
92 	return NULL;
93 
94     if (to == kCFStringEncodingUTF8)
95 	buflen = len * 6 + 1;
96     else
97 	buflen = len + 1;
98     retval = alloc(buflen);
99     if (retval == NULL)
100     {
101 	CFRelease(cfstr);
102 	return NULL;
103     }
104 
105 #if 0
106     CFRange convertRange = CFRangeMake(0, CFStringGetLength(cfstr));
107     /*  Determine output buffer size */
108     CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, NULL, 0, (CFIndex *)&buflen);
109     retval = (buflen > 0) ? alloc(buflen) : NULL;
110     if (retval == NULL) {
111 	CFRelease(cfstr);
112 	return NULL;
113     }
114 
115     if (lenp)
116 	*lenp = buflen / sizeof(char_u);
117 
118     if (!CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, retval, buflen, NULL))
119 #endif
120     if (!CFStringGetCString(cfstr, (char *)retval, buflen, to))
121     {
122 	CFRelease(cfstr);
123 	if (fail_on_error)
124 	{
125 	    vim_free(retval);
126 	    return NULL;
127 	}
128 
129 	fprintf(stderr, "Trying char-by-char conversion...\n");
130 	/* conversion failed for the whole string, but maybe it will work
131 	 * for each character */
132 	for (d = retval, in = 0, out = 0; in < len && out < buflen - 1;)
133 	{
134 	    if (from == kCFStringEncodingUTF8)
135 		l = utf_ptr2len(ptr + in);
136 	    else
137 		l = 1;
138 	    cfstr = CFStringCreateWithBytes(NULL, ptr + in, l, from, 0);
139 	    if (cfstr == NULL)
140 	    {
141 		*d++ = '?';
142 		out++;
143 	    }
144 	    else
145 	    {
146 		if (!CFStringGetCString(cfstr, (char *)d, buflen - out, to))
147 		{
148 		    *d++ = '?';
149 		    out++;
150 		}
151 		else
152 		{
153 		    i = STRLEN(d);
154 		    d += i;
155 		    out += i;
156 		}
157 		CFRelease(cfstr);
158 	    }
159 	    in += l;
160 	}
161 	*d = NUL;
162 	if (lenp != NULL)
163 	    *lenp = out;
164 	return retval;
165     }
166     CFRelease(cfstr);
167     if (lenp != NULL)
168 	*lenp = STRLEN(retval);
169 
170     return retval;
171 }
172 
173 /*
174  * Conversion from Apple MacRoman char encoding to UTF-8 or latin1, using
175  * standard Carbon framework.
176  * Input: "ptr[*sizep]".
177  * "real_size" is the size of the buffer that "ptr" points to.
178  * output is in-place, "sizep" is adjusted.
179  * Returns OK or FAIL.
180  */
181     int
182 macroman2enc(ptr, sizep, real_size)
183     char_u	*ptr;
184     long	*sizep;
185     long	real_size;
186 {
187     CFStringRef		cfstr;
188     CFRange		r;
189     CFIndex		len = *sizep;
190 
191     /* MacRoman is an 8-bit encoding, no need to move bytes to
192      * conv_rest[]. */
193     cfstr = CFStringCreateWithBytes(NULL, ptr, len,
194 						kCFStringEncodingMacRoman, 0);
195     /*
196      * If there is a conversion error, try using another
197      * conversion.
198      */
199     if (cfstr == NULL)
200 	return FAIL;
201 
202     r.location = 0;
203     r.length = CFStringGetLength(cfstr);
204     if (r.length != CFStringGetBytes(cfstr, r,
205 	    (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
206 	    0, /* no lossy conversion */
207 	    0, /* not external representation */
208 	    ptr + *sizep, real_size - *sizep, &len))
209     {
210 	CFRelease(cfstr);
211 	return FAIL;
212     }
213     CFRelease(cfstr);
214     mch_memmove(ptr, ptr + *sizep, len);
215     *sizep = len;
216 
217     return OK;
218 }
219 
220 /*
221  * Conversion from UTF-8 or latin1 to MacRoman.
222  * Input: "from[fromlen]"
223  * Output: "to[maxtolen]" length in "*tolenp"
224  * Unconverted rest in rest[*restlenp].
225  * Returns OK or FAIL.
226  */
227     int
228 enc2macroman(from, fromlen, to, tolenp, maxtolen, rest, restlenp)
229     char_u	*from;
230     size_t	fromlen;
231     char_u	*to;
232     int		*tolenp;
233     int		maxtolen;
234     char_u	*rest;
235     int		*restlenp;
236 {
237     CFStringRef	cfstr;
238     CFRange	r;
239     CFIndex	l;
240 
241     *restlenp = 0;
242     cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
243 	    (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
244 	    0);
245     while (cfstr == NULL && *restlenp < 3 && fromlen > 1)
246     {
247 	rest[*restlenp++] = from[--fromlen];
248 	cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
249 		(enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
250 		0);
251     }
252     if (cfstr == NULL)
253 	return FAIL;
254 
255     r.location = 0;
256     r.length = CFStringGetLength(cfstr);
257     if (r.length != CFStringGetBytes(cfstr, r,
258 		kCFStringEncodingMacRoman,
259 		0, /* no lossy conversion */
260 		0, /* not external representation (since vim
261 		    * handles this internally */
262 		to, maxtolen, &l))
263     {
264 	CFRelease(cfstr);
265 	return FAIL;
266     }
267     CFRelease(cfstr);
268     *tolenp = l;
269     return OK;
270 }
271 
272 /*
273  * Initializes text converters
274  */
275     void
276 mac_conv_init()
277 {
278     TextEncoding    utf8_encoding;
279     TextEncoding    utf8_hfsplus_encoding;
280     TextEncoding    utf8_canon_encoding;
281     TextEncoding    utf16_encoding;
282 
283     utf8_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
284 	    kTextEncodingDefaultVariant, kUnicodeUTF8Format);
285     utf8_hfsplus_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
286 	    kUnicodeHFSPlusCompVariant, kUnicodeUTF8Format);
287     utf8_canon_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
288 	    kUnicodeCanonicalCompVariant, kUnicodeUTF8Format);
289     utf16_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
290 	    kTextEncodingDefaultVariant, kUnicode16BitFormat);
291 
292     if (TECCreateConverter(&gPathConverter, utf8_encoding,
293 		utf8_hfsplus_encoding) != noErr)
294 	gPathConverter = NULL;
295 
296     if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
297 		utf8_canon_encoding) != noErr)
298     {
299 	/* On pre-10.3, Unicode normalization is not available so
300 	 * fall back to non-normalizing converter */
301 	if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
302 		    utf8_encoding) != noErr)
303 	    gUTF16ToUTF8Converter = NULL;
304     }
305 }
306 
307 /*
308  * Destroys text converters
309  */
310     void
311 mac_conv_cleanup()
312 {
313     if (gUTF16ToUTF8Converter)
314     {
315 	TECDisposeConverter(gUTF16ToUTF8Converter);
316 	gUTF16ToUTF8Converter = NULL;
317     }
318 
319     if (gPathConverter)
320     {
321 	TECDisposeConverter(gPathConverter);
322 	gPathConverter = NULL;
323     }
324 }
325 
326 /*
327  * Conversion from UTF-16 UniChars to 'encoding'
328  * The function signature uses the real type of UniChar (as typedef'ed in
329  * CFBase.h) to avoid clashes with X11 header files in the .pro file
330  */
331     char_u *
332 mac_utf16_to_enc(from, fromLen, actualLen)
333     unsigned short *from;
334     size_t fromLen;
335     size_t *actualLen;
336 {
337     /* Following code borrows somewhat from os_mswin.c */
338     vimconv_T	conv;
339     size_t      utf8_len;
340     char_u      *utf8_str;
341     char_u      *result = NULL;
342 
343     /* Convert to utf-8 first, works better with iconv */
344     utf8_len = 0;
345     utf8_str = mac_utf16_to_utf8(from, fromLen, &utf8_len);
346 
347     if (utf8_str)
348     {
349 	/* We might be called before we have p_enc set up. */
350 	conv.vc_type = CONV_NONE;
351 
352 	/* If encoding (p_enc) is any unicode, it is actually in utf-8 (vim
353 	 * internal unicode is always utf-8) so don't convert in such cases */
354 
355 	if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0)
356 	    convert_setup(&conv, (char_u *)"utf-8",
357 		    p_enc? p_enc: (char_u *)"macroman");
358 	if (conv.vc_type == CONV_NONE)
359 	{
360 	    /* p_enc is utf-8, so we're done. */
361 	    result = utf8_str;
362 	}
363 	else
364 	{
365 	    result = string_convert(&conv, utf8_str, (int *)&utf8_len);
366 	    vim_free(utf8_str);
367 	}
368 
369 	convert_setup(&conv, NULL, NULL);
370 
371 	if (actualLen)
372 	    *actualLen = utf8_len;
373     }
374     else if (actualLen)
375 	*actualLen = 0;
376 
377     return result;
378 }
379 
380 /*
381  * Conversion from 'encoding' to UTF-16 UniChars
382  * The function return uses the real type of UniChar (as typedef'ed in
383  * CFBase.h) to avoid clashes with X11 header files in the .pro file
384  */
385     unsigned short *
386 mac_enc_to_utf16(from, fromLen, actualLen)
387     char_u *from;
388     size_t fromLen;
389     size_t *actualLen;
390 {
391     /* Following code borrows somewhat from os_mswin.c */
392     vimconv_T	conv;
393     size_t      utf8_len;
394     char_u      *utf8_str;
395     UniChar     *result = NULL;
396     Boolean     should_free_utf8 = FALSE;
397 
398     do
399     {
400 	/* Use MacRoman by default, we might be called before we have p_enc
401 	 * set up.  Convert to utf-8 first, works better with iconv().  Does
402 	 * nothing if 'encoding' is "utf-8". */
403 	conv.vc_type = CONV_NONE;
404 	if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0 &&
405 		convert_setup(&conv, p_enc ? p_enc : (char_u *)"macroman",
406 		    (char_u *)"utf-8") == FAIL)
407 	    break;
408 
409 	if (conv.vc_type != CONV_NONE)
410 	{
411 	    utf8_len = fromLen;
412 	    utf8_str = string_convert(&conv, from, (int *)&utf8_len);
413 	    should_free_utf8 = TRUE;
414 	}
415 	else
416 	{
417 	    utf8_str = from;
418 	    utf8_len = fromLen;
419 	}
420 
421 	if (utf8_str == NULL)
422 	    break;
423 
424 	convert_setup(&conv, NULL, NULL);
425 
426 	result = mac_utf8_to_utf16(utf8_str, utf8_len, actualLen);
427 
428 	if (should_free_utf8)
429 	    vim_free(utf8_str);
430 	return result;
431     }
432     while (0);
433 
434     if (actualLen)
435 	*actualLen = 0;
436 
437     return result;
438 }
439 
440 /*
441  * Converts from UTF-16 UniChars to CFString
442  * The void * return type is actually a CFStringRef
443  */
444     void *
445 mac_enc_to_cfstring(from, fromLen)
446     char_u  *from;
447     size_t  fromLen;
448 {
449     UniChar	*utf16_str;
450     size_t	utf16_len;
451     CFStringRef	result = NULL;
452 
453     utf16_str = mac_enc_to_utf16(from, fromLen, &utf16_len);
454     if (utf16_str)
455     {
456 	result = CFStringCreateWithCharacters(NULL, utf16_str, utf16_len/sizeof(UniChar));
457 	vim_free(utf16_str);
458     }
459 
460     return (void *)result;
461 }
462 
463 /*
464  * Converts a decomposed HFS+ UTF-8 path to precomposed UTF-8
465  */
466     char_u *
467 mac_precompose_path(decompPath, decompLen, precompLen)
468     char_u  *decompPath;
469     size_t  decompLen;
470     size_t  *precompLen;
471 {
472     char_u  *result = NULL;
473     size_t  actualLen = 0;
474 
475     if (gPathConverter)
476     {
477 	result = alloc(decompLen);
478 	if (result)
479 	{
480 	    if (TECConvertText(gPathConverter, decompPath,
481 			decompLen, &decompLen, result,
482 			decompLen, &actualLen) != noErr)
483 	    {
484 		vim_free(result);
485 		result = NULL;
486 	    }
487 	}
488     }
489 
490     if (precompLen)
491 	*precompLen = actualLen;
492 
493     return result;
494 }
495 
496 /*
497  * Converts from UTF-16 UniChars to precomposed UTF-8
498  */
499     static char_u *
500 mac_utf16_to_utf8(from, fromLen, actualLen)
501     UniChar *from;
502     size_t fromLen;
503     size_t *actualLen;
504 {
505     ByteCount		utf8_len;
506     ByteCount		inputRead;
507     char_u		*result;
508 
509     if (gUTF16ToUTF8Converter)
510     {
511 	result = alloc(fromLen * 6 + 1);
512 	if (result && TECConvertText(gUTF16ToUTF8Converter, (ConstTextPtr)from,
513 		    fromLen, &inputRead, result,
514 		    (fromLen*6+1)*sizeof(char_u), &utf8_len) == noErr)
515 	{
516 	    TECFlushText(gUTF16ToUTF8Converter, result, (fromLen*6+1)*sizeof(char_u), &inputRead);
517 	    utf8_len += inputRead;
518 	}
519 	else
520 	{
521 	    vim_free(result);
522 	    result = NULL;
523 	}
524     }
525     else
526     {
527 	result = NULL;
528     }
529 
530     if (actualLen)
531 	*actualLen = result ? utf8_len : 0;
532 
533     return result;
534 }
535 
536 /*
537  * Converts from UTF-8 to UTF-16 UniChars
538  */
539     static UniChar *
540 mac_utf8_to_utf16(from, fromLen, actualLen)
541     char_u *from;
542     size_t fromLen;
543     size_t *actualLen;
544 {
545     CFStringRef  utf8_str;
546     CFRange      convertRange;
547     UniChar      *result = NULL;
548 
549     utf8_str = CFStringCreateWithBytes(NULL, from, fromLen,
550 	    kCFStringEncodingUTF8, FALSE);
551 
552     if (utf8_str == NULL) {
553 	if (actualLen)
554 	    *actualLen = 0;
555 	return NULL;
556     }
557 
558     convertRange = CFRangeMake(0, CFStringGetLength(utf8_str));
559     result = (UniChar *)alloc(convertRange.length * sizeof(UniChar));
560 
561     CFStringGetCharacters(utf8_str, convertRange, result);
562 
563     CFRelease(utf8_str);
564 
565     if (actualLen)
566 	*actualLen = convertRange.length * sizeof(UniChar);
567 
568     return result;
569 }
570 
571 /*
572  * Sets LANG environment variable in Vim from Mac locale
573  */
574     void
575 mac_lang_init() {
576     if (mch_getenv((char_u *)"LANG") == NULL)
577     {
578 	char	buf[20];
579 	if (LocaleRefGetPartString(NULL,
580 		    kLocaleLanguageMask | kLocaleLanguageVariantMask |
581 		    kLocaleRegionMask | kLocaleRegionVariantMask,
582 		    sizeof buf, buf) == noErr && *buf)
583 	{
584 	    vim_setenv((char_u *)"LANG", (char_u *)buf);
585 #   ifdef HAVE_LOCALE_H
586 	    setlocale(LC_ALL, "");
587 #   endif
588 	}
589     }
590 }
591 #endif /* MACOS_CONVERT */
592