xref: /vim-8.2.3635/src/arabic.c (revision eeec2548)
1 /* vi:set ts=8 sts=4 sw=4 noet:
2  *
3  * VIM - Vi IMproved    by Bram Moolenaar
4  *
5  * Do ":help uganda"  in Vim to read copying and usage conditions.
6  * Do ":help credits" in Vim to see a list of people who contributed.
7  * See README.txt for an overview of the Vim source code.
8  */
9 
10 /*
11  * arabic.c: functions for Arabic language
12  *
13  * Author: Nadim Shaikli & Isam Bayazidi
14  * Farsi support and restructuring to make adding new letters easier by Ali
15  * Gholami Rudi.  Further work by Ameretat Reith.
16  */
17 
18 /*
19  * Sorted list of unicode Arabic characters.  Each entry holds the
20  * presentation forms of a letter.
21  *
22  * Arabic characters are categorized into the following types:
23  *
24  * Isolated	- iso-8859-6 form
25  * Initial	- unicode form-B start
26  * Medial	- unicode form-B middle
27  * Final	- unicode form-B final
28  * Stand-Alone	- unicode form-B isolated
29  */
30 
31 #include "vim.h"
32 
33 #if defined(FEAT_ARABIC) || defined(PROTO)
34 
// Unicode code points of the Arabic letters and marks handled in this file.
#define a_HAMZA				0x0621
#define a_ALEF_MADDA			0x0622
#define a_ALEF_HAMZA_ABOVE		0x0623
#define a_WAW_HAMZA			0x0624
#define a_ALEF_HAMZA_BELOW		0x0625
#define a_YEH_HAMZA			0x0626
#define a_ALEF				0x0627
#define a_BEH				0x0628
#define a_TEH_MARBUTA			0x0629
#define a_TEH				0x062a
#define a_THEH				0x062b
#define a_JEEM				0x062c
#define a_HAH				0x062d
#define a_KHAH				0x062e
#define a_DAL				0x062f
#define a_THAL				0x0630
#define a_REH				0x0631
#define a_ZAIN				0x0632
#define a_SEEN				0x0633
#define a_SHEEN				0x0634
#define a_SAD				0x0635
#define a_DAD				0x0636
#define a_TAH				0x0637
#define a_ZAH				0x0638
#define a_AIN				0x0639
#define a_GHAIN				0x063a
#define a_TATWEEL			0x0640
#define a_FEH				0x0641
#define a_QAF				0x0642
#define a_KAF				0x0643
#define a_LAM				0x0644
#define a_MEEM				0x0645
#define a_NOON				0x0646
#define a_HEH				0x0647
#define a_WAW				0x0648
#define a_ALEF_MAKSURA			0x0649
#define a_YEH				0x064a
#define a_FATHATAN			0x064b
#define a_DAMMATAN			0x064c
#define a_KASRATAN			0x064d
#define a_FATHA				0x064e
#define a_DAMMA				0x064f
#define a_KASRA				0x0650
#define a_SHADDA			0x0651
#define a_SUKUN				0x0652
#define a_MADDA_ABOVE			0x0653
#define a_HAMZA_ABOVE			0x0654
#define a_HAMZA_BELOW			0x0655

// Additional letters (see the Farsi note in the file header).
#define a_PEH				0x067e
#define a_TCHEH				0x0686
#define a_JEH				0x0698
#define a_FKAF				0x06a9
#define a_GAF				0x06af
#define a_FYEH				0x06cc

// LAM + ALEF ligatures: "a_s_" is the stand-alone form, "a_f_" the final
// form.
#define a_s_LAM_ALEF_MADDA_ABOVE	0xfef5
#define a_f_LAM_ALEF_MADDA_ABOVE	0xfef6
#define a_s_LAM_ALEF_HAMZA_ABOVE	0xfef7
#define a_f_LAM_ALEF_HAMZA_ABOVE	0xfef8
#define a_s_LAM_ALEF_HAMZA_BELOW	0xfef9
#define a_f_LAM_ALEF_HAMZA_BELOW	0xfefa
#define a_s_LAM_ALEF			0xfefb
#define a_f_LAM_ALEF			0xfefc

/*
 * Presentation forms of one Arabic character.
 * A form that is zero means the character has no such form; arabic_shape()
 * then falls back to the unshaped character.
 *
 * The table MUST stay sorted on "c": find_achar() does a binary search.
 */
static struct achar {
    unsigned c;		// the unshaped character
    unsigned isolated;	// form when joined on neither side
    unsigned initial;	// form when joined to the next character only
    unsigned medial;	// form when joined on both sides
    unsigned final;	// form when joined to the previous character only
} achars[] = {
    {a_HAMZA, 0xfe80, 0, 0, 0},
    {a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82},
    {a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84},
    {a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86},
    {a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88},
    {a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},
    {a_ALEF, 0xfe8d, 0, 0, 0xfe8e},
    {a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},
    {a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94},
    {a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96},
    {a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},
    {a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},
    {a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2},
    {a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6},
    {a_DAL, 0xfea9, 0, 0, 0xfeaa},
    {a_THAL, 0xfeab, 0, 0, 0xfeac},
    {a_REH, 0xfead, 0, 0, 0xfeae},
    {a_ZAIN, 0xfeaf, 0, 0, 0xfeb0},
    {a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},
    {a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},
    {a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},
    {a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe},
    {a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2},
    {a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6},
    {a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca},
    {a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece},
    {a_TATWEEL, 0, 0x0640, 0x0640, 0x0640},
    {a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2},
    {a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6},
    {a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda},
    {a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede},
    {a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2},
    {a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6},
    {a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},
    {a_WAW, 0xfeed, 0, 0, 0xfeee},
    {a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0},
    {a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2},
    {a_FATHATAN, 0xfe70, 0, 0, 0},
    {a_DAMMATAN, 0xfe72, 0, 0, 0},
    {a_KASRATAN, 0xfe74, 0, 0, 0},
    {a_FATHA, 0xfe76, 0, 0xfe77, 0},
    {a_DAMMA, 0xfe78, 0, 0xfe79, 0},
    {a_KASRA, 0xfe7a, 0, 0xfe7b, 0},
    // The medial form of SHADDA is U+FE7D; U+FE7C is the isolated form.
    {a_SHADDA, 0xfe7c, 0, 0xfe7d, 0},
    {a_SUKUN, 0xfe7e, 0, 0xfe7f, 0},
    {a_MADDA_ABOVE, 0, 0, 0, 0},
    {a_HAMZA_ABOVE, 0, 0, 0, 0},
    {a_HAMZA_BELOW, 0, 0, 0, 0},
    {a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57},
    {a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},
    {a_JEH, 0xfb8a, 0, 0, 0xfb8b},
    {a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},
    {a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93},
    {a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},
};

// Not in achars[], but still accepted by A_is_ok().
#define a_BYTE_ORDER_MARK		0xfeff
165 
166 /*
167  * Find the struct achar pointer to the given Arabic char.
168  * Returns NULL if not found.
169  */
170     static struct achar *
find_achar(int c)171 find_achar(int c)
172 {
173     int h, m, l;
174 
175     // using binary search to find c
176     h = ARRAY_LENGTH(achars);
177     l = 0;
178     while (l < h)
179     {
180 	m = (h + l) / 2;
181 	if (achars[m].c == (unsigned)c)
182 	    return &achars[m];
183 	if ((unsigned)c < achars[m].c)
184 	    h = m;
185 	else
186 	    l = m + 1;
187     }
188     return NULL;
189 }
190 
191 /*
192  * Change shape - from Combination (2 char) to an Isolated
193  */
194     static int
chg_c_laa2i(int hid_c)195 chg_c_laa2i(int hid_c)
196 {
197     int tempc;
198 
199     switch (hid_c)
200     {
201 	case a_ALEF_MADDA:
202 	    tempc = a_s_LAM_ALEF_MADDA_ABOVE;
203 	    break;
204 	case a_ALEF_HAMZA_ABOVE:
205 	    tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
206 	    break;
207 	case a_ALEF_HAMZA_BELOW:
208 	    tempc = a_s_LAM_ALEF_HAMZA_BELOW;
209 	    break;
210 	case a_ALEF:
211 	    tempc = a_s_LAM_ALEF;
212 	    break;
213 	default:
214 	    tempc = 0;
215     }
216 
217     return tempc;
218 }
219 
220 /*
221  * Change shape - from Combination-Isolated to Final
222  */
223     static int
chg_c_laa2f(int hid_c)224 chg_c_laa2f(int hid_c)
225 {
226     int tempc;
227 
228     switch (hid_c)
229     {
230 	case a_ALEF_MADDA:
231 	    tempc = a_f_LAM_ALEF_MADDA_ABOVE;
232 	    break;
233 	case a_ALEF_HAMZA_ABOVE:
234 	    tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
235 	    break;
236 	case a_ALEF_HAMZA_BELOW:
237 	    tempc = a_f_LAM_ALEF_HAMZA_BELOW;
238 	    break;
239 	case a_ALEF:
240 	    tempc = a_f_LAM_ALEF;
241 	    break;
242 	default:
243 	    tempc = 0;
244     }
245 
246     return tempc;
247 }
248 
249 /*
250  * Returns whether it is possible to join the given letters
251  */
252     static int
can_join(int c1,int c2)253 can_join(int c1, int c2)
254 {
255     struct achar *a1 = find_achar(c1);
256     struct achar *a2 = find_achar(c2);
257 
258     return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
259 }
260 
261 /*
262  * Check whether we are dealing with a character that could be regarded as an
263  * Arabic combining character, need to check the character before this.
264  */
265     int
arabic_maycombine(int two)266 arabic_maycombine(int two)
267 {
268     if (p_arshape && !p_tbidi)
269 	return (two == a_ALEF_MADDA
270 		    || two == a_ALEF_HAMZA_ABOVE
271 		    || two == a_ALEF_HAMZA_BELOW
272 		    || two == a_ALEF);
273     return FALSE;
274 }
275 
276 /*
277  * Check whether we are dealing with Arabic combining characters.
278  * Note: these are NOT really composing characters!
279  */
280     int
arabic_combine(int one,int two)281 arabic_combine(
282     int		one,	    // first character
283     int		two)	    // character just after "one"
284 {
285     if (one == a_LAM)
286 	return arabic_maycombine(two);
287     return FALSE;
288 }
289 
/*
 * A_is_iso returns true if 'c' has an entry in the achars[] table.
 */
    static int
A_is_iso(int c)
{
    struct achar *entry = find_achar(c);

    return entry != NULL;
}
299 
300 /*
301  * A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
302  */
303     static int
A_is_ok(int c)304 A_is_ok(int c)
305 {
306     return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
307 }
308 
309 /*
310  * A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
311  *		with some exceptions/exclusions
312  */
313     static int
A_is_valid(int c)314 A_is_valid(int c)
315 {
316     return (A_is_ok(c) && c != a_HAMZA);
317 }
318 
319 /*
320  * Do Arabic shaping on character "c".  Returns the shaped character.
321  * out:    "ccp" points to the first byte of the character to be shaped.
322  * in/out: "c1p" points to the first composing char for "c".
323  * in:     "prev_c"  is the previous character (not shaped)
324  * in:     "prev_c1" is the first composing char for the previous char
325  *		     (not shaped)
326  * in:     "next_c"  is the next character (not shaped).
327  */
328     int
arabic_shape(int c,int * ccp,int * c1p,int prev_c,int prev_c1,int next_c)329 arabic_shape(
330     int		c,
331     int		*ccp,
332     int		*c1p,
333     int		prev_c,
334     int		prev_c1,
335     int		next_c)
336 {
337     int		curr_c;
338     int		curr_laa;
339     int		prev_laa;
340 
341     // Deal only with Arabic characters, pass back all others
342     if (!A_is_ok(c))
343 	return c;
344 
345     curr_laa = arabic_combine(c, *c1p);
346     prev_laa = arabic_combine(prev_c, prev_c1);
347 
348     if (curr_laa)
349     {
350 	if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa)
351 	    curr_c = chg_c_laa2f(*c1p);
352 	else
353 	    curr_c = chg_c_laa2i(*c1p);
354 
355 	// Remove the composing character
356 	*c1p = 0;
357     }
358     else
359     {
360 	struct achar *curr_a = find_achar(c);
361 	int backward_combine = !prev_laa && can_join(prev_c, c);
362 	int forward_combine = can_join(c, next_c);
363 
364 	if (backward_combine)
365 	{
366 	    if (forward_combine)
367 		curr_c = curr_a->medial;
368 	    else
369 		curr_c = curr_a->final;
370 	}
371 	else
372 	{
373 	    if (forward_combine)
374 		curr_c = curr_a->initial;
375 	    else
376 		curr_c = curr_a->isolated;
377 	}
378     }
379 
380     // Character missing from the table means using original character.
381     if (curr_c == NUL)
382 	curr_c = c;
383 
384     if (curr_c != c && ccp != NULL)
385     {
386 	char_u buf[MB_MAXBYTES + 1];
387 
388 	// Update the first byte of the character.
389 	(*mb_char2bytes)(curr_c, buf);
390 	*ccp = buf[0];
391     }
392 
393     // Return the shaped character
394     return curr_c;
395 }
396 #endif // FEAT_ARABIC
397