xref: /vim-8.2.3635/src/os_mac_conv.c (revision a5fe91e6)
1 /* vi:set ts=8 sts=4 sw=4 noet:
2  *
3  * VIM - Vi IMproved	by Bram Moolenaar
4  *
5  * Do ":help uganda"  in Vim to read copying and usage conditions.
6  * Do ":help credits" in Vim to see a list of people who contributed.
7  * See README.txt for an overview of the Vim source code.
8  */
9 /*
10  * os_mac_conv.c: Code specifically for Mac string conversions.
11  *
12  * This code has been put in a separate file to avoid the conflicts that are
13  * caused by including both the X11 and Carbon header files.
14  */
15 
16 #define NO_X11_INCLUDES
17 
18 #include "vim.h"
19 
20 #if !defined(PROTO)
21 # include <CoreServices/CoreServices.h>
22 #endif
23 
24 
25 #if defined(MACOS_CONVERT) || defined(PROTO)
26 
27 # ifdef PROTO
28 // A few dummy types to be able to generate function prototypes.
29 typedef int UniChar;
30 typedef int *TECObjectRef;
31 typedef int CFStringRef;
32 # endif
33 
// Forward declarations of the file-local UTF-16 <-> UTF-8 helpers.  They are
// defined near the end of this file but are called by functions above them.
// Both take a length in "fromLen" and report the result length through
// "actualLen".
34 static char_u	    *mac_utf16_to_utf8(UniChar *from, size_t fromLen, size_t *actualLen);
35 static UniChar	    *mac_utf8_to_utf16(char_u *from, size_t fromLen, size_t *actualLen);
36 
37 // Converter for composing decomposed HFS+ file paths
// Created by mac_conv_init(), used by mac_precompose_path() and disposed of
// by mac_conv_cleanup().  NULL when it could not be created.
38 static TECObjectRef gPathConverter;
39 // Converter used by mac_utf16_to_utf8
// Created by mac_conv_init() and disposed of by mac_conv_cleanup().  NULL
// when it could not be created.
40 static TECObjectRef gUTF16ToUTF8Converter;
41 
42 /*
43  * A Mac version of string_convert_ext() for special cases.
44  */
45     char_u *
mac_string_convert(char_u * ptr,int len,int * lenp,int fail_on_error,int from_enc,int to_enc,int * unconvlenp)46 mac_string_convert(
47     char_u		*ptr,
48     int			len,
49     int			*lenp,
50     int			fail_on_error,
51     int			from_enc,
52     int			to_enc,
53     int			*unconvlenp)
54 {
55     char_u		*retval, *d;
56     CFStringRef		cfstr;
57     int			buflen, in, out, l, i;
58     CFStringEncoding	from;
59     CFStringEncoding	to;
60 
61     switch (from_enc)
62     {
63 	case 'l':   from = kCFStringEncodingISOLatin1; break;
64 	case 'm':   from = kCFStringEncodingMacRoman; break;
65 	case 'u':   from = kCFStringEncodingUTF8; break;
66 	default:    return NULL;
67     }
68     switch (to_enc)
69     {
70 	case 'l':   to = kCFStringEncodingISOLatin1; break;
71 	case 'm':   to = kCFStringEncodingMacRoman; break;
72 	case 'u':   to = kCFStringEncodingUTF8; break;
73 	default:    return NULL;
74     }
75 
76     if (unconvlenp != NULL)
77 	*unconvlenp = 0;
78     cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
79 
80     if (cfstr == NULL)
81 	fprintf(stderr, "Encoding failed\n");
82     // When conversion failed, try excluding bytes from the end, helps when
83     // there is an incomplete byte sequence.  Only do up to 6 bytes to avoid
84     // looping a long time when there really is something unconvertible.
85     while (cfstr == NULL && unconvlenp != NULL && len > 1 && *unconvlenp < 6)
86     {
87 	--len;
88 	++*unconvlenp;
89 	cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
90     }
91     if (cfstr == NULL)
92 	return NULL;
93 
94     if (to == kCFStringEncodingUTF8)
95 	buflen = len * 6 + 1;
96     else
97 	buflen = len + 1;
98     retval = alloc(buflen);
99     if (retval == NULL)
100     {
101 	CFRelease(cfstr);
102 	return NULL;
103     }
104 
105 #if 0
106     CFRange convertRange = CFRangeMake(0, CFStringGetLength(cfstr));
107     //  Determine output buffer size
108     CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, NULL, 0, (CFIndex *)&buflen);
109     retval = (buflen > 0) ? alloc(buflen) : NULL;
110     if (retval == NULL) {
111 	CFRelease(cfstr);
112 	return NULL;
113     }
114 
115     if (lenp)
116 	*lenp = buflen / sizeof(char_u);
117 
118     if (!CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, retval, buflen, NULL))
119 #endif
120     if (!CFStringGetCString(cfstr, (char *)retval, buflen, to))
121     {
122 	CFRelease(cfstr);
123 	if (fail_on_error)
124 	{
125 	    vim_free(retval);
126 	    return NULL;
127 	}
128 
129 	fprintf(stderr, "Trying char-by-char conversion...\n");
130 	// conversion failed for the whole string, but maybe it will work
131 	// for each character
132 	for (d = retval, in = 0, out = 0; in < len && out < buflen - 1;)
133 	{
134 	    if (from == kCFStringEncodingUTF8)
135 		l = utf_ptr2len(ptr + in);
136 	    else
137 		l = 1;
138 	    cfstr = CFStringCreateWithBytes(NULL, ptr + in, l, from, 0);
139 	    if (cfstr == NULL)
140 	    {
141 		*d++ = '?';
142 		out++;
143 	    }
144 	    else
145 	    {
146 		if (!CFStringGetCString(cfstr, (char *)d, buflen - out, to))
147 		{
148 		    *d++ = '?';
149 		    out++;
150 		}
151 		else
152 		{
153 		    i = STRLEN(d);
154 		    d += i;
155 		    out += i;
156 		}
157 		CFRelease(cfstr);
158 	    }
159 	    in += l;
160 	}
161 	*d = NUL;
162 	if (lenp != NULL)
163 	    *lenp = out;
164 	return retval;
165     }
166     CFRelease(cfstr);
167     if (lenp != NULL)
168 	*lenp = STRLEN(retval);
169 
170     return retval;
171 }
172 
173 /*
174  * Conversion from Apple MacRoman char encoding to UTF-8 or latin1, using
175  * standard Carbon framework.
176  * Input: "ptr[*sizep]".
177  * "real_size" is the size of the buffer that "ptr" points to.
178  * output is in-place, "sizep" is adjusted.
179  * Returns OK or FAIL.
180  */
181     int
macroman2enc(char_u * ptr,long * sizep,long real_size)182 macroman2enc(
183     char_u	*ptr,
184     long	*sizep,
185     long	real_size)
186 {
187     CFStringRef		cfstr;
188     CFRange		r;
189     CFIndex		len = *sizep;
190 
191     // MacRoman is an 8-bit encoding, no need to move bytes to
192     // conv_rest[].
193     cfstr = CFStringCreateWithBytes(NULL, ptr, len,
194 						kCFStringEncodingMacRoman, 0);
195     /*
196      * If there is a conversion error, try using another
197      * conversion.
198      */
199     if (cfstr == NULL)
200 	return FAIL;
201 
202     r.location = 0;
203     r.length = CFStringGetLength(cfstr);
204     if (r.length != CFStringGetBytes(cfstr, r,
205 	    (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
206 	    0, // no lossy conversion
207 	    0, // not external representation
208 	    ptr + *sizep, real_size - *sizep, &len))
209     {
210 	CFRelease(cfstr);
211 	return FAIL;
212     }
213     CFRelease(cfstr);
214     mch_memmove(ptr, ptr + *sizep, len);
215     *sizep = len;
216 
217     return OK;
218 }
219 
220 /*
221  * Conversion from UTF-8 or latin1 to MacRoman.
222  * Input: "from[fromlen]"
223  * Output: "to[maxtolen]" length in "*tolenp"
224  * Unconverted rest in rest[*restlenp].
225  * Returns OK or FAIL.
226  */
227     int
enc2macroman(char_u * from,size_t fromlen,char_u * to,int * tolenp,int maxtolen,char_u * rest,int * restlenp)228 enc2macroman(
229     char_u	*from,
230     size_t	fromlen,
231     char_u	*to,
232     int		*tolenp,
233     int		maxtolen,
234     char_u	*rest,
235     int		*restlenp)
236 {
237     CFStringRef	cfstr;
238     CFRange	r;
239     CFIndex	l;
240 
241     *restlenp = 0;
242     cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
243 	    (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
244 	    0);
245     while (cfstr == NULL && *restlenp < 3 && fromlen > 1)
246     {
247 	rest[*restlenp++] = from[--fromlen];
248 	cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
249 		(enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
250 		0);
251     }
252     if (cfstr == NULL)
253 	return FAIL;
254 
255     r.location = 0;
256     r.length = CFStringGetLength(cfstr);
257     if (r.length != CFStringGetBytes(cfstr, r,
258 		kCFStringEncodingMacRoman,
259 		0, // no lossy conversion
260 		0, // not external representation (since vim
261 		   // handles this internally
262 		to, maxtolen, &l))
263     {
264 	CFRelease(cfstr);
265 	return FAIL;
266     }
267     CFRelease(cfstr);
268     *tolenp = l;
269     return OK;
270 }
271 
272 /*
273  * Initializes text converters
274  */
275     void
mac_conv_init(void)276 mac_conv_init(void)
277 {
278     TextEncoding    utf8_encoding;
279     TextEncoding    utf8_hfsplus_encoding;
280     TextEncoding    utf8_canon_encoding;
281     TextEncoding    utf16_encoding;
282 
283     utf8_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
284 	    kTextEncodingDefaultVariant, kUnicodeUTF8Format);
285     utf8_hfsplus_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
286 	    kUnicodeHFSPlusCompVariant, kUnicodeUTF8Format);
287     utf8_canon_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
288 	    kUnicodeCanonicalCompVariant, kUnicodeUTF8Format);
289     utf16_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
290 	    kTextEncodingDefaultVariant, kUnicode16BitFormat);
291 
292     if (TECCreateConverter(&gPathConverter, utf8_encoding,
293 		utf8_hfsplus_encoding) != noErr)
294 	gPathConverter = NULL;
295 
296     if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
297 		utf8_canon_encoding) != noErr)
298     {
299 	// On pre-10.3, Unicode normalization is not available so
300 	// fall back to non-normalizing converter
301 	if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
302 		    utf8_encoding) != noErr)
303 	    gUTF16ToUTF8Converter = NULL;
304     }
305 }
306 
307 /*
308  * Destroys text converters
309  */
310     void
mac_conv_cleanup(void)311 mac_conv_cleanup(void)
312 {
313     if (gUTF16ToUTF8Converter)
314     {
315 	TECDisposeConverter(gUTF16ToUTF8Converter);
316 	gUTF16ToUTF8Converter = NULL;
317     }
318 
319     if (gPathConverter)
320     {
321 	TECDisposeConverter(gPathConverter);
322 	gPathConverter = NULL;
323     }
324 }
325 
326 /*
327  * Conversion from UTF-16 UniChars to 'encoding'
328  * The function signature uses the real type of UniChar (as typedef'ed in
329  * CFBase.h) to avoid clashes with X11 header files in the .pro file
330  */
331     char_u *
mac_utf16_to_enc(unsigned short * from,size_t fromLen,size_t * actualLen)332 mac_utf16_to_enc(
333     unsigned short *from,
334     size_t fromLen,
335     size_t *actualLen)
336 {
337     // Following code borrows somewhat from os_mswin.c
338     vimconv_T	conv;
339     size_t      utf8_len;
340     char_u      *utf8_str;
341     char_u      *result = NULL;
342 
343     // Convert to utf-8 first, works better with iconv
344     utf8_len = 0;
345     utf8_str = mac_utf16_to_utf8(from, fromLen, &utf8_len);
346 
347     if (utf8_str)
348     {
349 	// We might be called before we have p_enc set up.
350 	conv.vc_type = CONV_NONE;
351 
352 	// If encoding (p_enc) is any unicode, it is actually in utf-8 (vim
353 	// internal unicode is always utf-8) so don't convert in such cases
354 
355 	if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0)
356 	    convert_setup(&conv, (char_u *)"utf-8",
357 		    p_enc? p_enc: (char_u *)"macroman");
358 	if (conv.vc_type == CONV_NONE)
359 	{
360 	    // p_enc is utf-8, so we're done.
361 	    result = utf8_str;
362 	}
363 	else
364 	{
365 	    result = string_convert(&conv, utf8_str, (int *)&utf8_len);
366 	    vim_free(utf8_str);
367 	}
368 
369 	convert_setup(&conv, NULL, NULL);
370 
371 	if (actualLen)
372 	    *actualLen = utf8_len;
373     }
374     else if (actualLen)
375 	*actualLen = 0;
376 
377     return result;
378 }
379 
380 /*
381  * Conversion from 'encoding' to UTF-16 UniChars
382  * The function return uses the real type of UniChar (as typedef'ed in
383  * CFBase.h) to avoid clashes with X11 header files in the .pro file
384  */
385     unsigned short *
mac_enc_to_utf16(char_u * from,size_t fromLen,size_t * actualLen)386 mac_enc_to_utf16(
387     char_u *from,
388     size_t fromLen,
389     size_t *actualLen)
390 {
391     // Following code borrows somewhat from os_mswin.c
392     vimconv_T	conv;
393     size_t      utf8_len;
394     char_u      *utf8_str;
395     UniChar     *result = NULL;
396     Boolean     should_free_utf8 = FALSE;
397 
398     do
399     {
400 	// Use MacRoman by default, we might be called before we have p_enc
401 	// set up.  Convert to utf-8 first, works better with iconv().  Does
402 	// nothing if 'encoding' is "utf-8".
403 	conv.vc_type = CONV_NONE;
404 	if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0 &&
405 		convert_setup(&conv, p_enc ? p_enc : (char_u *)"macroman",
406 		    (char_u *)"utf-8") == FAIL)
407 	    break;
408 
409 	if (conv.vc_type != CONV_NONE)
410 	{
411 	    utf8_len = fromLen;
412 	    utf8_str = string_convert(&conv, from, (int *)&utf8_len);
413 	    should_free_utf8 = TRUE;
414 	}
415 	else
416 	{
417 	    utf8_str = from;
418 	    utf8_len = fromLen;
419 	}
420 
421 	if (utf8_str == NULL)
422 	    break;
423 
424 	convert_setup(&conv, NULL, NULL);
425 
426 	result = mac_utf8_to_utf16(utf8_str, utf8_len, actualLen);
427 
428 	if (should_free_utf8)
429 	    vim_free(utf8_str);
430 	return result;
431     }
432     while (0);
433 
434     if (actualLen)
435 	*actualLen = 0;
436 
437     return result;
438 }
439 
440 /*
441  * Converts from UTF-16 UniChars to CFString
442  * The void * return type is actually a CFStringRef
443  */
444     void *
mac_enc_to_cfstring(char_u * from,size_t fromLen)445 mac_enc_to_cfstring(
446     char_u  *from,
447     size_t  fromLen)
448 {
449     UniChar	*utf16_str;
450     size_t	utf16_len;
451     CFStringRef	result = NULL;
452 
453     utf16_str = mac_enc_to_utf16(from, fromLen, &utf16_len);
454     if (utf16_str)
455     {
456 	result = CFStringCreateWithCharacters(NULL, utf16_str, utf16_len/sizeof(UniChar));
457 	vim_free(utf16_str);
458     }
459 
460     return (void *)result;
461 }
462 
463 /*
464  * Converts a decomposed HFS+ UTF-8 path to precomposed UTF-8
465  */
466     char_u *
mac_precompose_path(char_u * decompPath,size_t decompLen,size_t * precompLen)467 mac_precompose_path(
468     char_u  *decompPath,
469     size_t  decompLen,
470     size_t  *precompLen)
471 {
472     char_u  *result = NULL;
473     size_t  actualLen = 0;
474 
475     if (gPathConverter)
476     {
477 	result = alloc(decompLen);
478 	if (result)
479 	{
480 	    if (TECConvertText(gPathConverter, decompPath,
481 			decompLen, &decompLen, result,
482 			decompLen, &actualLen) != noErr)
483 		VIM_CLEAR(result);
484 	}
485     }
486 
487     if (precompLen)
488 	*precompLen = actualLen;
489 
490     return result;
491 }
492 
493 /*
494  * Converts from UTF-16 UniChars to precomposed UTF-8
495  */
496     static char_u *
mac_utf16_to_utf8(UniChar * from,size_t fromLen,size_t * actualLen)497 mac_utf16_to_utf8(
498     UniChar *from,
499     size_t fromLen,
500     size_t *actualLen)
501 {
502     ByteCount		utf8_len;
503     ByteCount		inputRead;
504     char_u		*result;
505 
506     if (gUTF16ToUTF8Converter)
507     {
508 	result = alloc(fromLen * 6 + 1);
509 	if (result && TECConvertText(gUTF16ToUTF8Converter, (ConstTextPtr)from,
510 		    fromLen, &inputRead, result,
511 		    (fromLen*6+1)*sizeof(char_u), &utf8_len) == noErr)
512 	{
513 	    TECFlushText(gUTF16ToUTF8Converter, result, (fromLen*6+1)*sizeof(char_u), &inputRead);
514 	    utf8_len += inputRead;
515 	}
516 	else
517 	    VIM_CLEAR(result);
518     }
519     else
520     {
521 	result = NULL;
522     }
523 
524     if (actualLen)
525 	*actualLen = result ? utf8_len : 0;
526 
527     return result;
528 }
529 
530 /*
531  * Converts from UTF-8 to UTF-16 UniChars
532  */
533     static UniChar *
mac_utf8_to_utf16(char_u * from,size_t fromLen,size_t * actualLen)534 mac_utf8_to_utf16(
535     char_u *from,
536     size_t fromLen,
537     size_t *actualLen)
538 {
539     CFStringRef  utf8_str;
540     CFRange      convertRange;
541     UniChar      *result = NULL;
542 
543     utf8_str = CFStringCreateWithBytes(NULL, from, fromLen,
544 	    kCFStringEncodingUTF8, FALSE);
545 
546     if (utf8_str == NULL) {
547 	if (actualLen)
548 	    *actualLen = 0;
549 	return NULL;
550     }
551 
552     convertRange = CFRangeMake(0, CFStringGetLength(utf8_str));
553     result = ALLOC_MULT(UniChar, convertRange.length);
554 
555     CFStringGetCharacters(utf8_str, convertRange, result);
556 
557     CFRelease(utf8_str);
558 
559     if (actualLen)
560 	*actualLen = convertRange.length * sizeof(UniChar);
561 
562     return result;
563 }
564 
565 /*
566  * Sets LANG environment variable in Vim from Mac locale
567  */
568     void
mac_lang_init(void)569 mac_lang_init(void)
570 {
571     if (mch_getenv((char_u *)"LANG") == NULL)
572     {
573 	char	buf[50];
574 
575 	// $LANG is not set, either because it was unset or Vim was started
576 	// from the Dock.  Query the system locale.
577 	if (LocaleRefGetPartString(NULL,
578 		    kLocaleLanguageMask | kLocaleLanguageVariantMask |
579 		    kLocaleRegionMask | kLocaleRegionVariantMask,
580 		    sizeof(buf) - 10, buf) == noErr && *buf)
581 	{
582 	    if (strcasestr(buf, "utf-8") == NULL)
583 		strcat(buf, ".UTF-8");
584 	    vim_setenv((char_u *)"LANG", (char_u *)buf);
585 #   ifdef HAVE_LOCALE_H
586 	    setlocale(LC_ALL, "");
587 #   endif
588 #   if defined(FEAT_FLOAT) && defined(LC_NUMERIC)
589 	    // Make sure strtod() uses a decimal point, not a comma.
590 	    setlocale(LC_NUMERIC, "C");
591 #   endif
592 	}
593     }
594 }
595 #endif // MACOS_CONVERT
596