xref: /vim-8.2.3635/src/spell.h (revision 07399e7f)
1 /* vi:set ts=8 sts=4 sw=4 noet:
2  *
3  * VIM - Vi IMproved	by Bram Moolenaar
4  *
5  * Do ":help uganda"  in Vim to read copying and usage conditions.
6  * Do ":help credits" in Vim to see a list of people who contributed.
7  * See README.txt for an overview of the Vim source code.
8  */
9 
10 /*
11  * spell.h: common code for spell checking, used by spell.c and spellfile.c.
12  */
13 
// Debugging switch: define SPELL_PRINTTREE to dump the word tree each time a
// word has been added.  Only use it for small word lists!
#if 0
# define SPELL_PRINTTREE
#endif

// Debugging switch: define SPELL_COMPRESS_ALLWAYS to compress the word tree
// each time a word has been added.  Only use it for small word lists!
#if 0
# define SPELL_COMPRESS_ALLWAYS
#endif

// Debugging switch: define DEBUG_TRIEWALK to print the changes that
// suggest_trie_walk() makes for a specific word.
#if 0
# define DEBUG_TRIEWALK
#endif
31 
// Assumed maximum length of a word in bytes.  Some places assume that a word
// length fits in a byte, thus this cannot be above 255.
// Must be >= PFD_NOTSPECIAL.
#define MAXWLEN 254

// Number of regions supported.
#define MAXREGIONS 8
38 
// Type for an index in the word tree.  It needs to be at least 4 bytes.  If
// int is 8 bytes something smaller could be used, but what?
typedef int idx_T;

// Type of the entries in the "sl_sal_first" table of slang_T.
typedef int salfirst_T;
44 
45 /*
46  * Structure used to store words and other info for one language, loaded from
47  * a .spl file.
48  * The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
49  * case-folded words.  "sl_kbyts/sl_kidxs" is for keep-case words.
50  *
51  * The "byts" array stores the possible bytes in each tree node, preceded by
52  * the number of possible bytes, sorted on byte value:
53  *	<len> <byte1> <byte2> ...
54  * The "idxs" array stores the index of the child node corresponding to the
55  * byte in "byts".
56  * Exception: when the byte is zero, the word may end here and "idxs" holds
57  * the flags, region mask and affixID for the word.  There may be several
58  * zeros in sequence for alternative flag/region/affixID combinations.
59  */
60 typedef struct slang_S slang_T;
61 struct slang_S
62 {
63     slang_T	*sl_next;	// next language
64     char_u	*sl_name;	// language name "en", "en.rare", "nl", etc.
65     char_u	*sl_fname;	// name of .spl file
66     int		sl_add;		// TRUE if it's a .add file.
67 
68     char_u	*sl_fbyts;	// case-folded word bytes
69     long	sl_fbyts_len;	// length of sl_fbyts
70     idx_T	*sl_fidxs;	// case-folded word indexes
71     char_u	*sl_kbyts;	// keep-case word bytes
72     idx_T	*sl_kidxs;	// keep-case word indexes
73     char_u	*sl_pbyts;	// prefix tree word bytes
74     idx_T	*sl_pidxs;	// prefix tree word indexes
75 
76     char_u	*sl_info;	// infotext string or NULL
77 
78     char_u	sl_regions[MAXREGIONS * 2 + 1];
79 				// table with up to 8 region names plus NUL
80 
81     char_u	*sl_midword;	// MIDWORD string or NULL
82 
83     hashtab_T	sl_wordcount;	// hashtable with word count, wordcount_T
84 
85     int		sl_compmax;	// COMPOUNDWORDMAX (default: MAXWLEN)
86     int		sl_compminlen;	// COMPOUNDMIN (default: 0)
87     int		sl_compsylmax;	// COMPOUNDSYLMAX (default: MAXWLEN)
88     int		sl_compoptions;	// COMP_* flags
89     garray_T	sl_comppat;	// CHECKCOMPOUNDPATTERN items
90     regprog_T	*sl_compprog;	// COMPOUNDRULE turned into a regexp progrm
91 				// (NULL when no compounding)
92     char_u	*sl_comprules;	// all COMPOUNDRULE concatenated (or NULL)
93     char_u	*sl_compstartflags; // flags for first compound word
94     char_u	*sl_compallflags; // all flags for compound words
95     char_u	sl_nobreak;	// When TRUE: no spaces between words
96     char_u	*sl_syllable;	// SYLLABLE repeatable chars or NULL
97     garray_T	sl_syl_items;	// syllable items
98 
99     int		sl_prefixcnt;	// number of items in "sl_prefprog"
100     regprog_T	**sl_prefprog;	// table with regprogs for prefixes
101 
102     garray_T	sl_rep;		// list of fromto_T entries from REP lines
103     short	sl_rep_first[256];  // indexes where byte first appears, -1 if
104 				    // there is none
105     garray_T	sl_sal;		// list of salitem_T entries from SAL lines
106     salfirst_T	sl_sal_first[256];  // indexes where byte first appears, -1 if
107 				    // there is none
108     int		sl_followup;	// SAL followup
109     int		sl_collapse;	// SAL collapse_result
110     int		sl_rem_accents;	// SAL remove_accents
111     int		sl_sofo;	// SOFOFROM and SOFOTO instead of SAL items:
112 				// "sl_sal_first" maps chars, when has_mbyte
113 				// "sl_sal" is a list of wide char lists.
114     garray_T	sl_repsal;	// list of fromto_T entries from REPSAL lines
115     short	sl_repsal_first[256];  // sl_rep_first for REPSAL lines
116     int		sl_nosplitsugs;	// don't suggest splitting a word
117     int		sl_nocompoundsugs; // don't suggest compounding
118 
119     // Info from the .sug file.  Loaded on demand.
120     time_t	sl_sugtime;	// timestamp for .sug file
121     char_u	*sl_sbyts;	// soundfolded word bytes
122     idx_T	*sl_sidxs;	// soundfolded word indexes
123     buf_T	*sl_sugbuf;	// buffer with word number table
124     int		sl_sugloaded;	// TRUE when .sug file was loaded or failed to
125 				// load
126 
127     int		sl_has_map;	// TRUE if there is a MAP line
128     hashtab_T	sl_map_hash;	// MAP for multi-byte chars
129     int		sl_map_array[256]; // MAP for first 256 chars
130     hashtab_T	sl_sounddone;	// table with soundfolded words that have
131 				// handled, see add_sound_suggest()
132 };
133 
// Template and name parts for spell file names.  The VMS variants use "_"
// where the others use ".", except for the ".spl" extension.
#ifndef VMS
# define SPL_FNAME_TMPL  "%s.%s.spl"
# define SPL_FNAME_ADD   ".add."
# define SPL_FNAME_ASCII ".ascii."
#else
# define SPL_FNAME_TMPL  "%s_%s.spl"
# define SPL_FNAME_ADD   "_add."
# define SPL_FNAME_ASCII "_ascii."
#endif
143 
// Flags for a word.  They must fit in the lowest byte, because the region
// byte comes above it.
#define WF_REGION   0x01	// a region byte follows
#define WF_ONECAP   0x02	// word with one capital (or all capitals)
#define WF_ALLCAP   0x04	// word must be all capitals
#define WF_RARE	    0x08	// rare word
#define WF_BANNED   0x10	// bad word
#define WF_AFX	    0x20	// an affix ID follows
#define WF_FIXCAP   0x40	// keep-case word, allcap not allowed
#define WF_KEEPCAP  0x80	// keep-case word

// Mask with all the flags about the case of a word.
#define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_FIXCAP | WF_KEEPCAP)

// Flags for <flags2>.  They are shifted up one byte, so that they can be used
// in wn_flags.
#define WF_HAS_AFF  0x0100	// word includes an affix
#define WF_NEEDCOMP 0x0200	// word is only valid in a compound
#define WF_NOSUGGEST 0x0400	// word is not to be suggested
#define WF_COMPROOT 0x0800	// already compounded word, COMPOUNDROOT
#define WF_NOCOMPBEF 0x1000	// no compounding before this word
#define WF_NOCOMPAFT 0x2000	// no compounding after this word
164 
// Flags for <pflags>.
#define WFP_RARE	    0x01	// rare prefix
#define WFP_NC		    0x02	// prefix is not combining
#define WFP_UP		    0x04	// to-upper prefix
#define WFP_COMPPERMIT	    0x08	// prefix with COMPOUNDPERMITFLAG
#define WFP_COMPFORBID	    0x10	// prefix with COMPOUNDFORBIDFLAG

// Flags for postponed prefixes in "sl_pidxs".  These are the WFP_* flags
// shifted up, because they must be above the affixID (one byte) and the
// prefcondnr (two bytes).
#define WF_RAREPFX  (WFP_RARE << 24)	// rare postponed prefix
#define WF_PFX_NC   (WFP_NC << 24)	// non-combining postponed prefix
#define WF_PFX_UP   (WFP_UP << 24)	// to-upper postponed prefix
// postponed prefix with COMPOUNDPERMITFLAG
#define WF_PFX_COMPPERMIT (WFP_COMPPERMIT << 24)
// postponed prefix with COMPOUNDFORBIDFLAG
#define WF_PFX_COMPFORBID (WFP_COMPFORBID << 24)
181 
// Flags for <compoptions>, one bit for each option.
#define COMP_CHECKDUP		1	// CHECKCOMPOUNDDUP
#define COMP_CHECKREP		2	// CHECKCOMPOUNDREP
#define COMP_CHECKCASE		4	// CHECKCOMPOUNDCASE
#define COMP_CHECKTRIPLE	8	// CHECKCOMPOUNDTRIPLE
187 
188 // Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep,
189 // si_repsal, sl_rep, and si_sal.  Not for sl_sal!
190 // One replacement: from "ft_from" to "ft_to".
191 typedef struct fromto_S
192 {
193     char_u	*ft_from;
194     char_u	*ft_to;
195 } fromto_T;
196 
197 // Info from "SAL" entries in ".aff" file used in sl_sal.
198 // The info is split for quick processing by spell_soundfold().
199 // Note that "sm_oneof" and "sm_rules" point into sm_lead.
200 typedef struct salitem_S
201 {
202     char_u	*sm_lead;	// leading letters
203     int		sm_leadlen;	// length of "sm_lead"
204     char_u	*sm_oneof;	// letters from () or NULL
205     char_u	*sm_rules;	// rules like ^, $, priority
206     char_u	*sm_to;		// replacement.
207     int		*sm_lead_w;	// wide character copy of "sm_lead"
208     int		*sm_oneof_w;	// wide character copy of "sm_oneof"
209     int		*sm_to_w;	// wide character copy of "sm_to"
210 } salitem_T;
211 
// Error codes for reading a spell file.  The values for SP_*ERROR are
// negative, positive values are used by read_cnt_string().
// Each value is in parentheses, so that it remains a single operand in any
// expression the macro is used in.
#define SP_TRUNCERROR	(-1)	// spell file truncated error
#define SP_FORMERROR	(-2)	// format error in spell file
#define SP_OTHERERROR	(-3)	// other error while reading spell file
217 
218 /*
219  * Structure used in "b_langp", filled from 'spelllang'.
220  */
221 typedef struct langp_S
222 {
223     slang_T	*lp_slang;	// info for this language
224     slang_T	*lp_sallang;	// language used for sound folding or NULL
225     slang_T	*lp_replang;	// language used for REP items or NULL
226     int		lp_region;	// bitmask for region or REGION_ALL
227 } langp_T;
228 
// Pointer to entry "i" of the langp_T array that "ga".ga_data points to.
#define LANGP_ENTRY(ga, i)	(&((langp_T *)(ga).ga_data)[i])
230 
// Header of a Vim .sug file.
#define VIMSUGMAGIC "VIMsug"	// the string at the start of the file
#define VIMSUGMAGICL 6		// length of VIMSUGMAGIC
#define VIMSUGVERSION 1
234 
235 /*
236  * The tables used for recognizing word characters according to spelling.
237  * These are only used for the first 256 characters of 'encoding'.
238  */
239 typedef struct spelltab_S
240 {
241     char_u  st_isw[256];	// flags: is word char
242     char_u  st_isu[256];	// flags: is uppercase char
243     char_u  st_fold[256];	// chars: folded case
244     char_u  st_upper[256];	// chars: upper case
245 } spelltab_T;
246 
/*
 * Character-case definitions for spell checking.  The ones of the current
 * locale are not used, because the locale may differ from what the .spl file
 * uses.
 * These must not be called with a negative number!
 * Each macro evaluates "c" more than once, thus "c" must not have side
 * effects.
 */
#if defined(HAVE_WCHAR_H)
# include <wchar.h>	    // for towupper() and towlower()
#endif

// Multi-byte implementation.  For Unicode the utf_*() functions are called,
// but not for ASCII, because 'casemap' is not to be used here.  Otherwise the
// "spelltab" tables are used for characters below 256 and the "w" library
// function, if available, for characters above 255.

// Without towlower() a character above 255 is returned as-is.
#ifndef HAVE_TOWLOWER
# define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
	    : (c) < 256 ? (int)spelltab.st_fold[c] : (c))
#else
# define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
	    : (c) < 256 ? (int)spelltab.st_fold[c] : (int)towlower(c))
#endif

// Without towupper() a character above 255 is returned as-is.
#ifndef HAVE_TOWUPPER
# define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
	    : (c) < 256 ? (int)spelltab.st_upper[c] : (c))
#else
# define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
	    : (c) < 256 ? (int)spelltab.st_upper[c] : (int)towupper(c))
#endif

// Without iswupper() a character above 255 is never uppercase.
#ifndef HAVE_ISWUPPER
# define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
	    : (c) < 256 ? spelltab.st_isu[c] : (FALSE))
#else
# define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
	    : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
#endif
281 
282 #ifdef FEAT_SPELL
283 // First language that is loaded, start of the linked list of loaded
284 // languages.
285 # ifdef IN_SPELL_C
286 #  define SPELL_EXTERN
287 #  define SPELL_INIT(x) x
288 # else
289 #  define SPELL_EXTERN extern
290 #  define SPELL_INIT(x)
291 # endif
292 
293 SPELL_EXTERN slang_T	*first_lang SPELL_INIT(= NULL);
294 
295 // file used for "zG" and "zW"
296 SPELL_EXTERN char_u	*int_wordlist SPELL_INIT(= NULL);
297 
298 
299 SPELL_EXTERN char e_format[] SPELL_INIT(= N_("E759: Format error in spell file"));
300 
301 SPELL_EXTERN spelltab_T   spelltab;
302 SPELL_EXTERN int	  did_set_spelltab;
303 
// Values for the "what" argument of spell_add_word().
#define SPELL_ADD_GOOD	0
#define SPELL_ADD_BAD	1
#define SPELL_ADD_RARE	2
308 
309 typedef struct wordcount_S
310 {
311     short_u	wc_count;	    // nr of times word was seen
312     char_u	wc_word[1];	    // word, actually longer
313 } wordcount_T;
314 
// Offset of "wc_word" in wordcount_T.
#define WC_KEY_OFF  offsetof(wordcount_T, wc_word)
// Get the wordcount_T from "hi", by going back WC_KEY_OFF bytes from
// "hi_key".  This requires that "hi_key" points to the "wc_word" member.
#define HI2WC(hi)     ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF))
#define MAXWORDCOUNT 0xffff
318 
319 // Remember what "z?" replaced.
320 SPELL_EXTERN char_u	*repl_from SPELL_INIT(= NULL);
321 SPELL_EXTERN char_u	*repl_to SPELL_INIT(= NULL);
322 #endif
323