xref: /vim-8.2.3635/src/libvterm/src/encoding.c (revision 591cec83)
1 #include "vterm_internal.h"
2 
3 #define UNICODE_INVALID 0xFFFD
4 
5 #if defined(DEBUG) && DEBUG > 1
6 # define DEBUG_PRINT_UTF8
7 #endif
8 
// State kept by the UTF-8 decoder while it is part-way through a multi-byte
// sequence.
struct UTF8DecoderData {
  // continuation bytes still expected before the current codepoint is complete
  int bytes_remaining;

  // full encoded length, in bytes, of the current codepoint; compared against
  // the decoded value to reject overlong encodings
  int bytes_total;

  // codepoint value accumulated from the bytes seen so far
  int this_cp;
};
19 
// Put the UTF-8 decoder state behind data_ into its idle condition: no
// multi-byte sequence in progress. this_cp is left untouched; the decoder
// assigns it when a lead byte arrives.
static void init_utf8(VTermEncoding *enc UNUSED, void *data_)
{
  struct UTF8DecoderData *state = data_;

  state->bytes_total = state->bytes_remaining = 0;
}
27 
// Decode UTF-8 from bytes[*pos .. bytelen) into cp[], advancing *pos past each
// byte consumed and *cpi past each codepoint stored (never beyond cplen).
//
// data_ points to a struct UTF8DecoderData holding any partially decoded
// sequence, so a multi-byte character may be finished by a later call.
//
// Decoding stops, without consuming the byte, at a C0 control (< 0x20) or at
// DEL (0x7f). UNICODE_INVALID is stored for: a continuation byte with no
// sequence in progress, a sequence interrupted by a printable ASCII byte or by
// a new lead byte, an overlong encoding, a surrogate (U+D800..U+DFFF), U+FFFE,
// U+FFFF, and the bytes 0xfe / 0xff.
//
// NOTE(review): 5- and 6-byte forms are accepted and no upper limit is
// enforced, so values above U+10FFFF are passed through unchanged.
static void decode_utf8(VTermEncoding *enc UNUSED, void *data_,
                        uint32_t cp[], int *cpi, int cplen,
                        const char bytes[], size_t *pos, size_t bytelen)
{
  struct UTF8DecoderData *data = data_;

#ifdef DEBUG_PRINT_UTF8
  printf("BEGIN UTF-8\n");
#endif

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos];

#ifdef DEBUG_PRINT_UTF8
    // %zu is the conversion that matches size_t
    printf(" pos=%zu c=%02x rem=%d\n", *pos, (unsigned)c, data->bytes_remaining);
#endif

    if(c < 0x20 || c == 0x7f) // C0 or DEL: not ours to consume
      return;

    else if(c < 0x7f) { // printable ASCII
      if(data->bytes_remaining) {
        // The pending multi-byte sequence was cut short.
        data->bytes_remaining = 0;
        cp[(*cpi)++] = UNICODE_INVALID;
        // No room left for the ASCII character itself: leave *pos on it so
        // that it is decoded on the next call.
        if(*cpi >= cplen)
          break;
      }
      cp[(*cpi)++] = c;
#ifdef DEBUG_PRINT_UTF8
      printf(" UTF-8 char: U+%04x\n", (unsigned)c);
#endif
    }

    else if(c < 0xc0) { // 0x80..0xbf: continuation byte
      if(!data->bytes_remaining) {
        cp[(*cpi)++] = UNICODE_INVALID;
        continue;
      }

      data->this_cp <<= 6;
      data->this_cp |= c & 0x3f;
      data->bytes_remaining--;

      if(!data->bytes_remaining) {
#ifdef DEBUG_PRINT_UTF8
        printf(" UTF-8 raw char U+%04x bytelen=%d ", (unsigned)data->this_cp, data->bytes_total);
#endif
        // Check for overlong sequences: the value must need this many bytes
        switch(data->bytes_total) {
        case 2:
          if(data->this_cp <  0x0080) data->this_cp = UNICODE_INVALID;
          break;
        case 3:
          if(data->this_cp <  0x0800) data->this_cp = UNICODE_INVALID;
          break;
        case 4:
          if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID;
          break;
        case 5:
          if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID;
          break;
        case 6:
          if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID;
          break;
        default:
          break;
        }
        // Now look for plain invalid ones
        if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) ||
           data->this_cp == 0xFFFE ||
           data->this_cp == 0xFFFF)
          data->this_cp = UNICODE_INVALID;
#ifdef DEBUG_PRINT_UTF8
        printf(" char: U+%04x\n", (unsigned)data->this_cp);
#endif
        cp[(*cpi)++] = data->this_cp;
      }
    }

    else if(c < 0xfe) { // 0xc0..0xfd: lead byte of a 2..6 byte sequence
      int seqlen;
      int payload;

      // The number of leading 1 bits gives the sequence length; the bits
      // below them are the most significant bits of the codepoint.
      if(c < 0xe0)      { seqlen = 2; payload = c & 0x1f; }
      else if(c < 0xf0) { seqlen = 3; payload = c & 0x0f; }
      else if(c < 0xf8) { seqlen = 4; payload = c & 0x07; }
      else if(c < 0xfc) { seqlen = 5; payload = c & 0x03; }
      else              { seqlen = 6; payload = c & 0x01; }

      // A new lead byte while a sequence is pending abandons that sequence.
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      data->this_cp = payload;
      data->bytes_total = seqlen;
      data->bytes_remaining = seqlen - 1;
    }

    else { // 0xfe, 0xff: never valid in UTF-8
      cp[(*cpi)++] = UNICODE_INVALID;
    }
  }
}
158 
159 static VTermEncoding encoding_utf8 = {
160   &init_utf8,  // init
161   &decode_utf8 // decode
162 };
163 
// Decode a run of printable 7-bit characters from bytes[*pos .. bytelen) into
// cp[], advancing *pos past each byte consumed and *cpi past each codepoint
// stored (never beyond cplen).
//
// The top bit of the first byte is recorded in is_gr and flipped in every byte
// of the run, so a run may consist entirely of bytes with the top bit set or
// entirely of bytes with it clear. Decoding stops, leaving *pos on the
// offending byte, at the first byte that (after the flip) is below 0x20, is
// 0x7f, or has the top bit set.
static void decode_usascii(VTermEncoding *enc UNUSED, void *data UNUSED,
                           uint32_t cp[], int *cpi, int cplen,
                           const char bytes[], size_t *pos, size_t bytelen)
{
  int is_gr;

  // No input left: do not peek at bytes[*pos], it would be past the end.
  if(*pos >= bytelen)
    return;

  is_gr = bytes[*pos] & 0x80;

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos] ^ is_gr;

    if(c < 0x20 || c == 0x7f || c >= 0x80)
      return;

    cp[(*cpi)++] = c;
  }
}
179 
180 static VTermEncoding encoding_usascii = {
181   NULL,           // init
182   &decode_usascii // decode
183 };
184 
185 struct StaticTableEncoding {
186   const VTermEncoding enc;
187   const uint32_t chars[128];
188 };
189 
// Decode a run of printable 7-bit characters from bytes[*pos .. bytelen) into
// cp[], translating each one through the chars[] table of the
// StaticTableEncoding that enc points to. A table entry of 0 means the
// character value itself is stored.
//
// The top bit of the first byte is recorded in is_gr and flipped in every byte
// of the run. Decoding stops, leaving *pos on the offending byte, at the first
// byte that (after the flip) is below 0x20, is 0x7f, or has the top bit set,
// or when cp[] holds cplen entries.
static void decode_table(VTermEncoding *enc, void *data UNUSED,
                         uint32_t cp[], int *cpi, int cplen,
                         const char bytes[], size_t *pos, size_t bytelen)
{
  struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc;
  int is_gr;

  // No input left: do not peek at bytes[*pos], it would be past the end.
  if(*pos >= bytelen)
    return;

  is_gr = bytes[*pos] & 0x80;

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos] ^ is_gr;

    if(c < 0x20 || c == 0x7f || c >= 0x80)
      return;

    // c is now in 0x20..0x7e, so it is a valid index into chars[128]
    if(table->chars[c])
      cp[(*cpi)++] = table->chars[c];
    else
      cp[(*cpi)++] = c;
  }
}
209 
210 #include "encoding/DECdrawing.inc"
211 #include "encoding/uk.inc"
212 
213 static struct {
214   VTermEncodingType type;
215   char designation;
216   VTermEncoding *enc;
217 }
218 encodings[] = {
219   { ENC_UTF8,      'u', &encoding_utf8 },
220   { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing },
221   { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk },
222   { ENC_SINGLE_94, 'B', &encoding_usascii },
223   { 0, 0, NULL },
224 };
225 
226 /* This ought to be INTERNAL but isn't because it's used by unit testing */
vterm_lookup_encoding(VTermEncodingType type,char designation)227 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
228 {
229   int i;
230   for(i = 0; encodings[i].designation; i++)
231     if(encodings[i].type == type && encodings[i].designation == designation)
232       return encodings[i].enc;
233   return NULL;
234 }
235