xref: /vim-8.2.3635/src/spell.h (revision ba3ff539)
1 /* vi:set ts=8 sts=4 sw=4 noet:
2  *
3  * VIM - Vi IMproved	by Bram Moolenaar
4  *
5  * Do ":help uganda"  in Vim to read copying and usage conditions.
6  * Do ":help credits" in Vim to see a list of people who contributed.
7  * See README.txt for an overview of the Vim source code.
8  */
9 
10 /*
11  * spell.h: common code for spell checking, used by spell.c and spellfile.c.
12  */
13 
14 /* Use SPELL_PRINTTREE for debugging: dump the word tree after adding a word.
15  * Only use it for small word lists! */
16 #if 0	/* disabled; change to "#if 1" to enable */
17 # define SPELL_PRINTTREE
18 #endif
19 
20 /* Use SPELL_COMPRESS_ALLWAYS for debugging: compress the word tree after
21  * adding a word.  Only use it for small word lists! */
22 #if 0	/* disabled; change to "#if 1" to enable */
23 # define SPELL_COMPRESS_ALLWAYS
24 #endif
25 
26 /* Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk() for a
27  * specific word. */
28 #if 0	/* disabled; change to "#if 1" to enable */
29 # define DEBUG_TRIEWALK
30 #endif
31 
32 #define MAXWLEN 254		/* Maximum word length in bytes that is assumed.
33 				   Some places assume a word length fits in a
34 				   byte, thus it can't be above 255.
35 				   Must be >= PFD_NOTSPECIAL. */
36 
37 #define MAXREGIONS 8		/* Number of regions supported, sizes sl_regions[]. */
38 
39 /* Type used for indexes in the word tree; it needs to be at least 4 bytes.
40  * If int is 8 bytes we could use something smaller, but what? */
41 #if VIM_SIZEOF_INT > 3
42 typedef int idx_T;	/* int has at least 4 bytes */
43 #else
44 typedef long idx_T;	/* int is too small, use long instead */
45 #endif
46 
47 #ifdef FEAT_MBYTE
48 typedef int salfirst_T;		/* element type of sl_sal_first[] in slang_T */
49 #else
50 typedef short salfirst_T;	/* short is used when FEAT_MBYTE is not defined */
51 #endif
52 
53 /*
54  * Structure used to store words and other info for one language, loaded from
55  * a .spl file.
56  * The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
57  * case-folded words.  "sl_kbyts/sl_kidxs" is for keep-case words.
58  *
59  * The "byts" array stores the possible bytes in each tree node, preceded by
60  * the number of possible bytes, sorted on byte value:
61  *	<len> <byte1> <byte2> ...
62  * The "idxs" array stores the index of the child node corresponding to the
63  * byte in "byts".
64  * Exception: when the byte is zero, the word may end here and "idxs" holds
65  * the flags, region mask and affixID for the word.  There may be several
66  * zeros in sequence for alternative flag/region/affixID combinations.
67  */
68 typedef struct slang_S slang_T;
69 struct slang_S
70 {
71     slang_T	*sl_next;	/* next language */
72     char_u	*sl_name;	/* language name "en", "en.rare", "nl", etc. */
73     char_u	*sl_fname;	/* name of .spl file */
74     int		sl_add;		/* TRUE if it's a .add file. */
75 
76     char_u	*sl_fbyts;	/* case-folded word bytes */
77     idx_T	*sl_fidxs;	/* case-folded word indexes */
78     char_u	*sl_kbyts;	/* keep-case word bytes */
79     idx_T	*sl_kidxs;	/* keep-case word indexes */
80     char_u	*sl_pbyts;	/* prefix tree word bytes */
81     idx_T	*sl_pidxs;	/* prefix tree word indexes */
82 
83     char_u	*sl_info;	/* infotext string or NULL */
84 
85     char_u	sl_regions[MAXREGIONS * 2 + 1];
86 				/* table with up to MAXREGIONS region names plus NUL */
87 
88     char_u	*sl_midword;	/* MIDWORD string or NULL */
89 
90     hashtab_T	sl_wordcount;	/* hashtable with word count, wordcount_T */
91 
92     int		sl_compmax;	/* COMPOUNDWORDMAX (default: MAXWLEN) */
93     int		sl_compminlen;	/* COMPOUNDMIN (default: 0) */
94     int		sl_compsylmax;	/* COMPOUNDSYLMAX (default: MAXWLEN) */
95     int		sl_compoptions;	/* COMP_* flags */
96     garray_T	sl_comppat;	/* CHECKCOMPOUNDPATTERN items */
97     regprog_T	*sl_compprog;	/* COMPOUNDRULE turned into a regexp program
98 				 * (NULL when no compounding) */
99     char_u	*sl_comprules;	/* all COMPOUNDRULE concatenated (or NULL) */
100     char_u	*sl_compstartflags; /* flags for first compound word */
101     char_u	*sl_compallflags; /* all flags for compound words */
102     char_u	sl_nobreak;	/* When TRUE: no spaces between words */
103     char_u	*sl_syllable;	/* SYLLABLE repeatable chars or NULL */
104     garray_T	sl_syl_items;	/* syllable items */
105 
106     int		sl_prefixcnt;	/* number of items in "sl_prefprog" */
107     regprog_T	**sl_prefprog;	/* table with regprogs for prefixes */
108 
109     garray_T	sl_rep;		/* list of fromto_T entries from REP lines */
110     short	sl_rep_first[256];  /* indexes where byte first appears, -1 if
111 				       there is none */
112     garray_T	sl_sal;		/* list of salitem_T entries from SAL lines */
113     salfirst_T	sl_sal_first[256];  /* indexes where byte first appears, -1 if
114 				       there is none */
115     int		sl_followup;	/* SAL followup */
116     int		sl_collapse;	/* SAL collapse_result */
117     int		sl_rem_accents;	/* SAL remove_accents */
118     int		sl_sofo;	/* SOFOFROM and SOFOTO instead of SAL items:
119 				 * "sl_sal_first" maps chars, when has_mbyte
120 				 * "sl_sal" is a list of wide char lists. */
121     garray_T	sl_repsal;	/* list of fromto_T entries from REPSAL lines */
122     short	sl_repsal_first[256];  /* sl_rep_first for REPSAL lines */
123     int		sl_nosplitsugs;	/* don't suggest splitting a word */
124     int		sl_nocompoundsugs; /* don't suggest compounding */
125 
126     /* Info from the .sug file.  Loaded on demand. */
127     time_t	sl_sugtime;	/* timestamp for .sug file */
128     char_u	*sl_sbyts;	/* soundfolded word bytes */
129     idx_T	*sl_sidxs;	/* soundfolded word indexes */
130     buf_T	*sl_sugbuf;	/* buffer with word number table */
131     int		sl_sugloaded;	/* TRUE when .sug file was loaded or failed to
132 				   load */
133 
134     int		sl_has_map;	/* TRUE if there is a MAP line */
135 #ifdef FEAT_MBYTE
136     hashtab_T	sl_map_hash;	/* MAP for multi-byte chars */
137     int		sl_map_array[256]; /* MAP for first 256 chars */
138 #else
139     char_u	sl_map_array[256]; /* MAP for first 256 chars */
140 #endif
141     hashtab_T	sl_sounddone;	/* table with soundfolded words that have
142 				   been handled, see add_sound_suggest() */
143 };
144 
145 #ifdef VMS	/* VMS: '_' replaces most of the '.' separators used elsewhere */
146 # define SPL_FNAME_TMPL  "%s_%s.spl"
147 # define SPL_FNAME_ADD   "_add."
148 # define SPL_FNAME_ASCII "_ascii."
149 #else
150 # define SPL_FNAME_TMPL  "%s.%s.spl"
151 # define SPL_FNAME_ADD   ".add."
152 # define SPL_FNAME_ASCII ".ascii."
153 #endif
154 
155 /* Flags used for a word.  Only the lowest byte can be used, the region byte
156  * comes above it. */
157 #define WF_REGION   0x01	/* region byte follows */
158 #define WF_ONECAP   0x02	/* word with one capital (or all capitals) */
159 #define WF_ALLCAP   0x04	/* word must be all capitals */
160 #define WF_RARE	    0x08	/* rare word */
161 #define WF_BANNED   0x10	/* bad word */
162 #define WF_AFX	    0x20	/* affix ID follows */
163 #define WF_FIXCAP   0x40	/* keep-case word, allcap not allowed */
164 #define WF_KEEPCAP  0x80	/* keep-case word */
165 
166 /* Flags for <flags2>; the values are shifted up one byte for use in wn_flags. */
167 #define WF_HAS_AFF  0x0100	/* word includes affix */
168 #define WF_NEEDCOMP 0x0200	/* word only valid in compound */
169 #define WF_NOSUGGEST 0x0400	/* word not to be suggested */
170 #define WF_COMPROOT 0x0800	/* already compounded word, COMPOUNDROOT */
171 #define WF_NOCOMPBEF 0x1000	/* no compounding before this word */
172 #define WF_NOCOMPAFT 0x2000	/* no compounding after this word */
173 
174 /* Flags for <pflags>; shifted left 24 bits they form WF_RAREPFX and WF_PFX_*. */
175 #define WFP_RARE	    0x01	/* rare prefix */
176 #define WFP_NC		    0x02	/* prefix is not combining */
177 #define WFP_UP		    0x04	/* to-upper prefix */
178 #define WFP_COMPPERMIT	    0x08	/* prefix with COMPOUNDPERMITFLAG */
179 #define WFP_COMPFORBID	    0x10	/* prefix with COMPOUNDFORBIDFLAG */
180 
181 /* Flags for postponed prefixes in "sl_pidxs".  Must be above affixID (one
182  * byte) and prefcondnr (two bytes), thus the WFP_* flags are shifted by 24. */
183 #define WF_RAREPFX  (WFP_RARE << 24)	/* rare postponed prefix */
184 #define WF_PFX_NC   (WFP_NC << 24)	/* non-combining postponed prefix */
185 #define WF_PFX_UP   (WFP_UP << 24)	/* to-upper postponed prefix */
186 #define WF_PFX_COMPPERMIT (WFP_COMPPERMIT << 24) /* postponed prefix with
187 						  * COMPOUNDPERMITFLAG */
188 #define WF_PFX_COMPFORBID (WFP_COMPFORBID << 24) /* postponed prefix with
189 						  * COMPOUNDFORBIDFLAG */
190 
191 /* Flags for <compoptions>, kept in "sl_compoptions". */
192 #define COMP_CHECKDUP		1	/* CHECKCOMPOUNDDUP */
193 #define COMP_CHECKREP		2	/* CHECKCOMPOUNDREP */
194 #define COMP_CHECKCASE		4	/* CHECKCOMPOUNDCASE */
195 #define COMP_CHECKTRIPLE	8	/* CHECKCOMPOUNDTRIPLE */
196 
197 /* Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep,
198  * si_repsal, sl_rep, sl_repsal and si_sal.  Not for sl_sal!
199  * One replacement: from "ft_from" to "ft_to". */
200 typedef struct fromto_S
201 {
202     char_u	*ft_from;	/* text that is to be replaced */
203     char_u	*ft_to;		/* text it is replaced with */
204 } fromto_T;
205 
206 /* Info from "SAL" entries in ".aff" file used in sl_sal.
207  * The info is split for quick processing by spell_soundfold().
208  * Note that "sm_oneof" and "sm_rules" point into sm_lead. */
209 typedef struct salitem_S
210 {
211     char_u	*sm_lead;	/* leading letters */
212     int		sm_leadlen;	/* length of "sm_lead" */
213     char_u	*sm_oneof;	/* letters from () or NULL; points into "sm_lead" */
214     char_u	*sm_rules;	/* rules like ^, $, priority; points into "sm_lead" */
215     char_u	*sm_to;		/* replacement. */
216 #ifdef FEAT_MBYTE
217     int		*sm_lead_w;	/* wide character copy of "sm_lead" */
218     int		*sm_oneof_w;	/* wide character copy of "sm_oneof" */
219     int		*sm_to_w;	/* wide character copy of "sm_to" */
220 #endif
221 } salitem_T;
222 
223 /* Values for SP_*ERROR are negative; positive values are used by
224  * read_cnt_string(). */
225 #define	SP_TRUNCERROR	-1	/* spell file truncated error */
226 #define	SP_FORMERROR	-2	/* format error in spell file */
227 #define SP_OTHERERROR	-3	/* other error while reading spell file */
228 
229 /*
230  * Structure used in "b_langp", filled from 'spelllang'.
231  */
232 typedef struct langp_S
233 {
234     slang_T	*lp_slang;	/* info for this language */
235     slang_T	*lp_sallang;	/* language used for sound folding or NULL */
236     slang_T	*lp_replang;	/* language used for REP items or NULL */
237     int		lp_region;	/* bitmask for region or REGION_ALL */
238 } langp_T;
239 /* Pointer to item "i" of "ga", whose ga_data is used as a langp_T array. */
240 #define LANGP_ENTRY(ga, i)	(((langp_T *)(ga).ga_data) + (i))
241 
242 #define VIMSUGMAGIC "VIMsug"	/* string at start of Vim .sug file */
243 #define VIMSUGMAGICL 6		/* length of VIMSUGMAGIC in bytes */
244 #define VIMSUGVERSION 1		/* presumably the .sug format version - verify */
245 
246 /*
247  * The tables used for recognizing word characters according to spelling.
248  * These are only used for the first 256 characters of 'encoding'.
249  * Each table is indexed with the character value. */
250 typedef struct spelltab_S
251 {
252     char_u  st_isw[256];	/* flags: is word char */
253     char_u  st_isu[256];	/* flags: is uppercase char */
254     char_u  st_fold[256];	/* chars: folded case */
255     char_u  st_upper[256];	/* chars: upper case */
256 } spelltab_T;
257 
258 /*
259  * Use our own character-case definitions, because the current locale may
260  * differ from what the .spl file uses.
261  * Must not be called with a negative number; "c" is evaluated more than once!
262  */
263 #ifndef FEAT_MBYTE
264 /* Non-multi-byte implementation. */
265 # define SPELL_TOFOLD(c) ((c) < 256 ? (int)spelltab.st_fold[c] : (c))
266 # define SPELL_TOUPPER(c) ((c) < 256 ? (int)spelltab.st_upper[c] : (c))
267 # define SPELL_ISUPPER(c) ((c) < 256 ? spelltab.st_isu[c] : FALSE)
268 #else
269 # if defined(HAVE_WCHAR_H)
270 #  include <wchar.h>	    /* for towupper() and towlower() */
271 # endif
272 /* Multi-byte implementation.  For Unicode we can call utf_*(), but don't do
273  * that for ASCII, because we don't want to use 'casemap' here.  Otherwise use
274  * the "w" library function for characters above 255 if available. */
275 # ifdef HAVE_TOWLOWER
276 #  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
277 	    : (c) < 256 ? (int)spelltab.st_fold[c] : (int)towlower(c))
278 # else
279 #  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
280 	    : (c) < 256 ? (int)spelltab.st_fold[c] : (c))
281 # endif
282 
283 # ifdef HAVE_TOWUPPER
284 #  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
285 	    : (c) < 256 ? (int)spelltab.st_upper[c] : (int)towupper(c))
286 # else
287 #  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
288 	    : (c) < 256 ? (int)spelltab.st_upper[c] : (c))
289 # endif
290 
291 # ifdef HAVE_ISWUPPER
292 #  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
293 	    : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
294 # else
295 #  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
296 	    : (c) < 256 ? spelltab.st_isu[c] : (FALSE))
297 # endif
298 #endif
299 
300 #ifdef FEAT_SPELL
301 /* SPELL_EXTERN and SPELL_INIT: when IN_SPELL_C is defined the variables below
302  * are defined and initialized here, otherwise they are declared "extern". */
303 # ifdef IN_SPELL_C
304 #  define SPELL_EXTERN
305 #  define SPELL_INIT(x) x
306 # else
307 #  define SPELL_EXTERN extern
308 #  define SPELL_INIT(x)
309 # endif
310 /* First language that is loaded, start of the linked list of loaded languages. */
311 SPELL_EXTERN slang_T	*first_lang SPELL_INIT(= NULL);
312 
313 /* file used for "zG" and "zW" */
314 SPELL_EXTERN char_u	*int_wordlist SPELL_INIT(= NULL);
315 
316 /* Error message for a format error in a spell file. */
317 SPELL_EXTERN char e_format[] SPELL_INIT(= N_("E759: Format error in spell file"));
318 
319 SPELL_EXTERN spelltab_T   spelltab;	/* tables used by SPELL_TOFOLD() etc. */
320 SPELL_EXTERN int	  did_set_spelltab;
321 
322 #endif /* FEAT_SPELL */
323