1 /* vi:set ts=8 sts=4 sw=4 noet: 2 * 3 * VIM - Vi IMproved by Bram Moolenaar 4 * 5 * Do ":help uganda" in Vim to read copying and usage conditions. 6 * Do ":help credits" in Vim to see a list of people who contributed. 7 * See README.txt for an overview of the Vim source code. 8 */ 9 10 /* 11 * spell.c: code for spell checking 12 * 13 * See spellfile.c for the Vim spell file format. 14 * 15 * The spell checking mechanism uses a tree (aka trie). Each node in the tree 16 * has a list of bytes that can appear (siblings). For each byte there is a 17 * pointer to the node with the byte that follows in the word (child). 18 * 19 * A NUL byte is used where the word may end. The bytes are sorted, so that 20 * binary searching can be used and the NUL bytes are at the start. The 21 * number of possible bytes is stored before the list of bytes. 22 * 23 * The tree uses two arrays: "byts" stores the characters, "idxs" stores 24 * either the next index or flags. The tree starts at index 0. For example, 25 * to lookup "vi" this sequence is followed: 26 * i = 0 27 * len = byts[i] 28 * n = where "v" appears in byts[i + 1] to byts[i + len] 29 * i = idxs[n] 30 * len = byts[i] 31 * n = where "i" appears in byts[i + 1] to byts[i + len] 32 * i = idxs[n] 33 * len = byts[i] 34 * find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi". 35 * 36 * There are two word trees: one with case-folded words and one with words in 37 * original case. The second one is only used for keep-case words and is 38 * usually small. 39 * 40 * There is one additional tree for when not all prefixes are applied when 41 * generating the .spl file. This tree stores all the possible prefixes, as 42 * if they were words. At each word (prefix) end the prefix nr is stored, the 43 * following word must support this prefix nr. And the condition nr is 44 * stored, used to lookup the condition that the word must match with. 45 * 46 * Thanks to Olaf Seibert for providing an example implementation of this tree 47 * and the compression mechanism. 48 * LZ trie ideas: 49 * http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf 50 * More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html 51 * 52 * Matching involves checking the caps type: Onecap ALLCAP KeepCap. 53 * 54 * Why doesn't Vim use aspell/ispell/myspell/etc.? 55 * See ":help develop-spell". 56 */ 57 58 /* 59 * Use this to adjust the score after finding suggestions, based on the 60 * suggested word sounding like the bad word. This is much faster than doing 61 * it for every possible suggestion. 62 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" 63 * vs "ht") and goes down in the list. 64 * Used when 'spellsuggest' is set to "best". 65 */ 66 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) 67 68 /* 69 * Do the opposite: based on a maximum end score and a known sound score, 70 * compute the maximum word score that can be used. 71 */ 72 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3) 73 74 #define IN_SPELL_C 75 #include "vim.h" 76 77 #if defined(FEAT_SPELL) || defined(PROTO) 78 79 #ifndef UNIX /* it's in os_unix.h for Unix */ 80 # include <time.h> /* for time_t */ 81 #endif 82 83 /* only used for su_badflags */ 84 #define WF_MIXCAP 0x20 /* mix of upper and lower case: macaRONI */ 85 86 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP | WF_FIXCAP) 87 88 #define REGION_ALL 0xff /* word valid in all regions */ 89 90 #define VIMSUGMAGIC "VIMsug" /* string at start of Vim .sug file */ 91 #define VIMSUGMAGICL 6 92 #define VIMSUGVERSION 1 93 94 /* Result values. Lower number is accepted over higher one. */ 95 #define SP_BANNED -1 96 #define SP_OK 0 97 #define SP_RARE 1 98 #define SP_LOCAL 2 99 #define SP_BAD 3 100 101 typedef struct wordcount_S 102 { 103 short_u wc_count; /* nr of times word was seen */ 104 char_u wc_word[1]; /* word, actually longer */ 105 } wordcount_T; 106 107 #define WC_KEY_OFF offsetof(wordcount_T, wc_word) 108 #define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF)) 109 #define MAXWORDCOUNT 0xffff 110 111 /* 112 * Information used when looking for suggestions. 113 */ 114 typedef struct suginfo_S 115 { 116 garray_T su_ga; /* suggestions, contains "suggest_T" */ 117 int su_maxcount; /* max. number of suggestions displayed */ 118 int su_maxscore; /* maximum score for adding to su_ga */ 119 int su_sfmaxscore; /* idem, for when doing soundfold words */ 120 garray_T su_sga; /* like su_ga, sound-folded scoring */ 121 char_u *su_badptr; /* start of bad word in line */ 122 int su_badlen; /* length of detected bad word in line */ 123 int su_badflags; /* caps flags for bad word */ 124 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ 125 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ 126 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */ 127 hashtab_T su_banned; /* table with banned words */ 128 slang_T *su_sallang; /* default language for sound folding */ 129 } suginfo_T; 130 131 /* One word suggestion. Used in "si_ga". */ 132 typedef struct suggest_S 133 { 134 char_u *st_word; /* suggested word, allocated string */ 135 int st_wordlen; /* STRLEN(st_word) */ 136 int st_orglen; /* length of replaced text */ 137 int st_score; /* lower is better */ 138 int st_altscore; /* used when st_score compares equal */ 139 int st_salscore; /* st_score is for soundalike */ 140 int st_had_bonus; /* bonus already included in score */ 141 slang_T *st_slang; /* language used for sound folding */ 142 } suggest_T; 143 144 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) 145 146 /* TRUE if a word appears in the list of banned words. */ 147 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word))) 148 149 /* Number of suggestions kept when cleaning up. We need to keep more than 150 * what is displayed, because when rescore_suggestions() is called the score 151 * may change and wrong suggestions may be removed later. */ 152 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20) 153 154 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots 155 * of suggestions that are not going to be displayed. */ 156 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) 157 158 /* score for various changes */ 159 #define SCORE_SPLIT 149 /* split bad word */ 160 #define SCORE_SPLIT_NO 249 /* split bad word with NOSPLITSUGS */ 161 #define SCORE_ICASE 52 /* slightly different case */ 162 #define SCORE_REGION 200 /* word is for different region */ 163 #define SCORE_RARE 180 /* rare word */ 164 #define SCORE_SWAP 75 /* swap two characters */ 165 #define SCORE_SWAP3 110 /* swap two characters in three */ 166 #define SCORE_REP 65 /* REP replacement */ 167 #define SCORE_SUBST 93 /* substitute a character */ 168 #define SCORE_SIMILAR 33 /* substitute a similar character */ 169 #define SCORE_SUBCOMP 33 /* substitute a composing character */ 170 #define SCORE_DEL 94 /* delete a character */ 171 #define SCORE_DELDUP 66 /* delete a duplicated character */ 172 #define SCORE_DELCOMP 28 /* delete a composing character */ 173 #define SCORE_INS 96 /* insert a character */ 174 #define SCORE_INSDUP 67 /* insert a duplicate character */ 175 #define SCORE_INSCOMP 30 /* insert a composing character */ 176 #define SCORE_NONWORD 103 /* change non-word to word char */ 177 178 #define SCORE_FILE 30 /* suggestion from a file */ 179 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. 180 * 350 allows for about three changes. */ 181 182 #define SCORE_COMMON1 30 /* subtracted for words seen before */ 183 #define SCORE_COMMON2 40 /* subtracted for words often seen */ 184 #define SCORE_COMMON3 50 /* subtracted for words very often seen */ 185 #define SCORE_THRES2 10 /* word count threshold for COMMON2 */ 186 #define SCORE_THRES3 100 /* word count threshold for COMMON3 */ 187 188 /* When trying changed soundfold words it becomes slow when trying more than 189 * two changes. With less then two changes it's slightly faster but we miss a 190 * few good suggestions. In rare cases we need to try three of four changes. 191 */ 192 #define SCORE_SFMAX1 200 /* maximum score for first try */ 193 #define SCORE_SFMAX2 300 /* maximum score for second try */ 194 #define SCORE_SFMAX3 400 /* maximum score for third try */ 195 196 #define SCORE_BIG SCORE_INS * 3 /* big difference */ 197 #define SCORE_MAXMAX 999999 /* accept any score */ 198 #define SCORE_LIMITMAX 350 /* for spell_edit_score_limit() */ 199 200 /* for spell_edit_score_limit() we need to know the minimum value of 201 * SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS */ 202 #define SCORE_EDIT_MIN SCORE_SIMILAR 203 204 /* 205 * Structure to store info for word matching. 206 */ 207 typedef struct matchinf_S 208 { 209 langp_T *mi_lp; /* info for language and region */ 210 211 /* pointers to original text to be checked */ 212 char_u *mi_word; /* start of word being checked */ 213 char_u *mi_end; /* end of matching word so far */ 214 char_u *mi_fend; /* next char to be added to mi_fword */ 215 char_u *mi_cend; /* char after what was used for 216 mi_capflags */ 217 218 /* case-folded text */ 219 char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */ 220 int mi_fwordlen; /* nr of valid bytes in mi_fword */ 221 222 /* for when checking word after a prefix */ 223 int mi_prefarridx; /* index in sl_pidxs with list of 224 affixID/condition */ 225 int mi_prefcnt; /* number of entries at mi_prefarridx */ 226 int mi_prefixlen; /* byte length of prefix */ 227 int mi_cprefixlen; /* byte length of prefix in original 228 case */ 229 230 /* for when checking a compound word */ 231 int mi_compoff; /* start of following word offset */ 232 char_u mi_compflags[MAXWLEN]; /* flags for compound words used */ 233 int mi_complen; /* nr of compound words used */ 234 int mi_compextra; /* nr of COMPOUNDROOT words */ 235 236 /* others */ 237 int mi_result; /* result so far: SP_BAD, SP_OK, etc. */ 238 int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */ 239 win_T *mi_win; /* buffer being checked */ 240 241 /* for NOBREAK */ 242 int mi_result2; /* "mi_resul" without following word */ 243 char_u *mi_end2; /* "mi_end" without following word */ 244 } matchinf_T; 245 246 247 static int spell_iswordp(char_u *p, win_T *wp); 248 static int spell_mb_isword_class(int cl, win_T *wp); 249 250 /* 251 * For finding suggestions: At each node in the tree these states are tried: 252 */ 253 typedef enum 254 { 255 STATE_START = 0, /* At start of node check for NUL bytes (goodword 256 * ends); if badword ends there is a match, otherwise 257 * try splitting word. */ 258 STATE_NOPREFIX, /* try without prefix */ 259 STATE_SPLITUNDO, /* Undo splitting. */ 260 STATE_ENDNUL, /* Past NUL bytes at start of the node. */ 261 STATE_PLAIN, /* Use each byte of the node. */ 262 STATE_DEL, /* Delete a byte from the bad word. */ 263 STATE_INS_PREP, /* Prepare for inserting bytes. */ 264 STATE_INS, /* Insert a byte in the bad word. */ 265 STATE_SWAP, /* Swap two bytes. */ 266 STATE_UNSWAP, /* Undo swap two characters. */ 267 STATE_SWAP3, /* Swap two characters over three. */ 268 STATE_UNSWAP3, /* Undo Swap two characters over three. */ 269 STATE_UNROT3L, /* Undo rotate three characters left */ 270 STATE_UNROT3R, /* Undo rotate three characters right */ 271 STATE_REP_INI, /* Prepare for using REP items. */ 272 STATE_REP, /* Use matching REP items from the .aff file. */ 273 STATE_REP_UNDO, /* Undo a REP item replacement. */ 274 STATE_FINAL /* End of this node. */ 275 } state_T; 276 277 /* 278 * Struct to keep the state at each level in suggest_try_change(). 279 */ 280 typedef struct trystate_S 281 { 282 state_T ts_state; /* state at this level, STATE_ */ 283 int ts_score; /* score */ 284 idx_T ts_arridx; /* index in tree array, start of node */ 285 short ts_curi; /* index in list of child nodes */ 286 char_u ts_fidx; /* index in fword[], case-folded bad word */ 287 char_u ts_fidxtry; /* ts_fidx at which bytes may be changed */ 288 char_u ts_twordlen; /* valid length of tword[] */ 289 char_u ts_prefixdepth; /* stack depth for end of prefix or 290 * PFD_PREFIXTREE or PFD_NOPREFIX */ 291 char_u ts_flags; /* TSF_ flags */ 292 char_u ts_tcharlen; /* number of bytes in tword character */ 293 char_u ts_tcharidx; /* current byte index in tword character */ 294 char_u ts_isdiff; /* DIFF_ values */ 295 char_u ts_fcharstart; /* index in fword where badword char started */ 296 char_u ts_prewordlen; /* length of word in "preword[]" */ 297 char_u ts_splitoff; /* index in "tword" after last split */ 298 char_u ts_splitfidx; /* "ts_fidx" at word split */ 299 char_u ts_complen; /* nr of compound words used */ 300 char_u ts_compsplit; /* index for "compflags" where word was spit */ 301 char_u ts_save_badflags; /* su_badflags saved here */ 302 char_u ts_delidx; /* index in fword for char that was deleted, 303 valid when "ts_flags" has TSF_DIDDEL */ 304 } trystate_T; 305 306 /* values for ts_isdiff */ 307 #define DIFF_NONE 0 /* no different byte (yet) */ 308 #define DIFF_YES 1 /* different byte found */ 309 #define DIFF_INSERT 2 /* inserting character */ 310 311 /* values for ts_flags */ 312 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */ 313 #define TSF_DIDSPLIT 2 /* tried split at this point */ 314 #define TSF_DIDDEL 4 /* did a delete, "ts_delidx" has index */ 315 316 /* special values ts_prefixdepth */ 317 #define PFD_NOPREFIX 0xff /* not using prefixes */ 318 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */ 319 #define PFD_NOTSPECIAL 0xfd /* highest value that's not special */ 320 321 /* mode values for find_word */ 322 #define FIND_FOLDWORD 0 /* find word case-folded */ 323 #define FIND_KEEPWORD 1 /* find keep-case word */ 324 #define FIND_PREFIX 2 /* find word after prefix */ 325 #define FIND_COMPOUND 3 /* find case-folded compound word */ 326 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ 327 328 static void find_word(matchinf_T *mip, int mode); 329 static int match_checkcompoundpattern(char_u *ptr, int wlen, garray_T *gap); 330 static int can_compound(slang_T *slang, char_u *word, char_u *flags); 331 static int match_compoundrule(slang_T *slang, char_u *compflags); 332 static int valid_word_prefix(int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req); 333 static void find_prefix(matchinf_T *mip, int mode); 334 static int fold_more(matchinf_T *mip); 335 static int spell_valid_case(int wordflags, int treeflags); 336 static void spell_load_cb(char_u *fname, void *cookie); 337 static int count_syllables(slang_T *slang, char_u *word); 338 static void clear_midword(win_T *buf); 339 static void use_midword(slang_T *lp, win_T *buf); 340 static int find_region(char_u *rp, char_u *region); 341 static int check_need_cap(linenr_T lnum, colnr_T col); 342 static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive); 343 #ifdef FEAT_EVAL 344 static void spell_suggest_expr(suginfo_T *su, char_u *expr); 345 #endif 346 static void spell_suggest_file(suginfo_T *su, char_u *fname); 347 static void spell_suggest_intern(suginfo_T *su, int interactive); 348 static void spell_find_cleanup(suginfo_T *su); 349 static void suggest_try_special(suginfo_T *su); 350 static void suggest_try_change(suginfo_T *su); 351 static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold); 352 static void go_deeper(trystate_T *stack, int depth, int score_add); 353 static int nofold_len(char_u *fword, int flen, char_u *word); 354 static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword); 355 static void score_comp_sal(suginfo_T *su); 356 static void score_combine(suginfo_T *su); 357 static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound); 358 static void suggest_try_soundalike_prep(void); 359 static void suggest_try_soundalike(suginfo_T *su); 360 static void suggest_try_soundalike_finish(void); 361 static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp); 362 static int soundfold_find(slang_T *slang, char_u *word); 363 static void make_case_word(char_u *fword, char_u *cword, int flags); 364 static int similar_chars(slang_T *slang, int c1, int c2); 365 static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf); 366 static void check_suggestions(suginfo_T *su, garray_T *gap); 367 static void add_banned(suginfo_T *su, char_u *word); 368 static void rescore_suggestions(suginfo_T *su); 369 static void rescore_one(suginfo_T *su, suggest_T *stp); 370 static int cleanup_suggestions(garray_T *gap, int maxscore, int keep); 371 static void spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res); 372 static void spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res); 373 static void spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res); 374 static int soundalike_score(char_u *goodsound, char_u *badsound); 375 static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword); 376 static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit); 377 static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit); 378 static void dump_word(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T lnum); 379 static linenr_T dump_prefixes(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T startlnum); 380 381 382 /* Remember what "z?" replaced. */ 383 static char_u *repl_from = NULL; 384 static char_u *repl_to = NULL; 385 386 /* 387 * Main spell-checking function. 388 * "ptr" points to a character that could be the start of a word. 389 * "*attrp" is set to the highlight index for a badly spelled word. For a 390 * non-word or when it's OK it remains unchanged. 391 * This must only be called when 'spelllang' is not empty. 392 * 393 * "capcol" is used to check for a Capitalised word after the end of a 394 * sentence. If it's zero then perform the check. Return the column where to 395 * check next, or -1 when no sentence end was found. If it's NULL then don't 396 * worry. 397 * 398 * Returns the length of the word in bytes, also when it's OK, so that the 399 * caller can skip over the word. 400 */ 401 int 402 spell_check( 403 win_T *wp, /* current window */ 404 char_u *ptr, 405 hlf_T *attrp, 406 int *capcol, /* column to check for Capital */ 407 int docount) /* count good words */ 408 { 409 matchinf_T mi; /* Most things are put in "mi" so that it can 410 be passed to functions quickly. */ 411 int nrlen = 0; /* found a number first */ 412 int c; 413 int wrongcaplen = 0; 414 int lpi; 415 int count_word = docount; 416 417 /* A word never starts at a space or a control character. Return quickly 418 * then, skipping over the character. */ 419 if (*ptr <= ' ') 420 return 1; 421 422 /* Return here when loading language files failed. */ 423 if (wp->w_s->b_langp.ga_len == 0) 424 return 1; 425 426 vim_memset(&mi, 0, sizeof(matchinf_T)); 427 428 /* A number is always OK. Also skip hexadecimal numbers 0xFF99 and 429 * 0X99FF. But always do check spelling to find "3GPP" and "11 430 * julifeest". */ 431 if (*ptr >= '0' && *ptr <= '9') 432 { 433 if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B')) 434 mi.mi_end = skipbin(ptr + 2); 435 else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) 436 mi.mi_end = skiphex(ptr + 2); 437 else 438 mi.mi_end = skipdigits(ptr); 439 nrlen = (int)(mi.mi_end - ptr); 440 } 441 442 /* Find the normal end of the word (until the next non-word character). */ 443 mi.mi_word = ptr; 444 mi.mi_fend = ptr; 445 if (spell_iswordp(mi.mi_fend, wp)) 446 { 447 do 448 MB_PTR_ADV(mi.mi_fend); 449 while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp)); 450 451 if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL) 452 { 453 /* Check word starting with capital letter. */ 454 c = PTR2CHAR(ptr); 455 if (!SPELL_ISUPPER(c)) 456 wrongcaplen = (int)(mi.mi_fend - ptr); 457 } 458 } 459 if (capcol != NULL) 460 *capcol = -1; 461 462 /* We always use the characters up to the next non-word character, 463 * also for bad words. */ 464 mi.mi_end = mi.mi_fend; 465 466 /* Check caps type later. */ 467 mi.mi_capflags = 0; 468 mi.mi_cend = NULL; 469 mi.mi_win = wp; 470 471 /* case-fold the word with one non-word character, so that we can check 472 * for the word end. */ 473 if (*mi.mi_fend != NUL) 474 MB_PTR_ADV(mi.mi_fend); 475 476 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, 477 MAXWLEN + 1); 478 mi.mi_fwordlen = (int)STRLEN(mi.mi_fword); 479 480 /* The word is bad unless we recognize it. */ 481 mi.mi_result = SP_BAD; 482 mi.mi_result2 = SP_BAD; 483 484 /* 485 * Loop over the languages specified in 'spelllang'. 486 * We check them all, because a word may be matched longer in another 487 * language. 488 */ 489 for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi) 490 { 491 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi); 492 493 /* If reloading fails the language is still in the list but everything 494 * has been cleared. */ 495 if (mi.mi_lp->lp_slang->sl_fidxs == NULL) 496 continue; 497 498 /* Check for a matching word in case-folded words. */ 499 find_word(&mi, FIND_FOLDWORD); 500 501 /* Check for a matching word in keep-case words. */ 502 find_word(&mi, FIND_KEEPWORD); 503 504 /* Check for matching prefixes. */ 505 find_prefix(&mi, FIND_FOLDWORD); 506 507 /* For a NOBREAK language, may want to use a word without a following 508 * word as a backup. */ 509 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD 510 && mi.mi_result2 != SP_BAD) 511 { 512 mi.mi_result = mi.mi_result2; 513 mi.mi_end = mi.mi_end2; 514 } 515 516 /* Count the word in the first language where it's found to be OK. */ 517 if (count_word && mi.mi_result == SP_OK) 518 { 519 count_common_word(mi.mi_lp->lp_slang, ptr, 520 (int)(mi.mi_end - ptr), 1); 521 count_word = FALSE; 522 } 523 } 524 525 if (mi.mi_result != SP_OK) 526 { 527 /* If we found a number skip over it. Allows for "42nd". Do flag 528 * rare and local words, e.g., "3GPP". */ 529 if (nrlen > 0) 530 { 531 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) 532 return nrlen; 533 } 534 535 /* When we are at a non-word character there is no error, just 536 * skip over the character (try looking for a word after it). */ 537 else if (!spell_iswordp_nmw(ptr, wp)) 538 { 539 if (capcol != NULL && wp->w_s->b_cap_prog != NULL) 540 { 541 regmatch_T regmatch; 542 int r; 543 544 /* Check for end of sentence. */ 545 regmatch.regprog = wp->w_s->b_cap_prog; 546 regmatch.rm_ic = FALSE; 547 r = vim_regexec(®match, ptr, 0); 548 wp->w_s->b_cap_prog = regmatch.regprog; 549 if (r) 550 *capcol = (int)(regmatch.endp[0] - ptr); 551 } 552 553 if (has_mbyte) 554 return (*mb_ptr2len)(ptr); 555 return 1; 556 } 557 else if (mi.mi_end == ptr) 558 /* Always include at least one character. Required for when there 559 * is a mixup in "midword". */ 560 MB_PTR_ADV(mi.mi_end); 561 else if (mi.mi_result == SP_BAD 562 && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak) 563 { 564 char_u *p, *fp; 565 int save_result = mi.mi_result; 566 567 /* First language in 'spelllang' is NOBREAK. Find first position 568 * at which any word would be valid. */ 569 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0); 570 if (mi.mi_lp->lp_slang->sl_fidxs != NULL) 571 { 572 p = mi.mi_word; 573 fp = mi.mi_fword; 574 for (;;) 575 { 576 MB_PTR_ADV(p); 577 MB_PTR_ADV(fp); 578 if (p >= mi.mi_end) 579 break; 580 mi.mi_compoff = (int)(fp - mi.mi_fword); 581 find_word(&mi, FIND_COMPOUND); 582 if (mi.mi_result != SP_BAD) 583 { 584 mi.mi_end = p; 585 break; 586 } 587 } 588 mi.mi_result = save_result; 589 } 590 } 591 592 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) 593 *attrp = HLF_SPB; 594 else if (mi.mi_result == SP_RARE) 595 *attrp = HLF_SPR; 596 else 597 *attrp = HLF_SPL; 598 } 599 600 if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE)) 601 { 602 /* Report SpellCap only when the word isn't badly spelled. */ 603 *attrp = HLF_SPC; 604 return wrongcaplen; 605 } 606 607 return (int)(mi.mi_end - ptr); 608 } 609 610 /* 611 * Check if the word at "mip->mi_word" is in the tree. 612 * When "mode" is FIND_FOLDWORD check in fold-case word tree. 613 * When "mode" is FIND_KEEPWORD check in keep-case word tree. 614 * When "mode" is FIND_PREFIX check for word after prefix in fold-case word 615 * tree. 616 * 617 * For a match mip->mi_result is updated. 618 */ 619 static void 620 find_word(matchinf_T *mip, int mode) 621 { 622 idx_T arridx = 0; 623 int endlen[MAXWLEN]; /* length at possible word endings */ 624 idx_T endidx[MAXWLEN]; /* possible word endings */ 625 int endidxcnt = 0; 626 int len; 627 int wlen = 0; 628 int flen; 629 int c; 630 char_u *ptr; 631 idx_T lo, hi, m; 632 char_u *s; 633 char_u *p; 634 int res = SP_BAD; 635 slang_T *slang = mip->mi_lp->lp_slang; 636 unsigned flags; 637 char_u *byts; 638 idx_T *idxs; 639 int word_ends; 640 int prefix_found; 641 int nobreak_result; 642 643 if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) 644 { 645 /* Check for word with matching case in keep-case tree. */ 646 ptr = mip->mi_word; 647 flen = 9999; /* no case folding, always enough bytes */ 648 byts = slang->sl_kbyts; 649 idxs = slang->sl_kidxs; 650 651 if (mode == FIND_KEEPCOMPOUND) 652 /* Skip over the previously found word(s). */ 653 wlen += mip->mi_compoff; 654 } 655 else 656 { 657 /* Check for case-folded in case-folded tree. */ 658 ptr = mip->mi_fword; 659 flen = mip->mi_fwordlen; /* available case-folded bytes */ 660 byts = slang->sl_fbyts; 661 idxs = slang->sl_fidxs; 662 663 if (mode == FIND_PREFIX) 664 { 665 /* Skip over the prefix. */ 666 wlen = mip->mi_prefixlen; 667 flen -= mip->mi_prefixlen; 668 } 669 else if (mode == FIND_COMPOUND) 670 { 671 /* Skip over the previously found word(s). */ 672 wlen = mip->mi_compoff; 673 flen -= mip->mi_compoff; 674 } 675 676 } 677 678 if (byts == NULL) 679 return; /* array is empty */ 680 681 /* 682 * Repeat advancing in the tree until: 683 * - there is a byte that doesn't match, 684 * - we reach the end of the tree, 685 * - or we reach the end of the line. 686 */ 687 for (;;) 688 { 689 if (flen <= 0 && *mip->mi_fend != NUL) 690 flen = fold_more(mip); 691 692 len = byts[arridx++]; 693 694 /* If the first possible byte is a zero the word could end here. 695 * Remember this index, we first check for the longest word. */ 696 if (byts[arridx] == 0) 697 { 698 if (endidxcnt == MAXWLEN) 699 { 700 /* Must be a corrupted spell file. */ 701 emsg(_(e_format)); 702 return; 703 } 704 endlen[endidxcnt] = wlen; 705 endidx[endidxcnt++] = arridx++; 706 --len; 707 708 /* Skip over the zeros, there can be several flag/region 709 * combinations. */ 710 while (len > 0 && byts[arridx] == 0) 711 { 712 ++arridx; 713 --len; 714 } 715 if (len == 0) 716 break; /* no children, word must end here */ 717 } 718 719 /* Stop looking at end of the line. */ 720 if (ptr[wlen] == NUL) 721 break; 722 723 /* Perform a binary search in the list of accepted bytes. */ 724 c = ptr[wlen]; 725 if (c == TAB) /* <Tab> is handled like <Space> */ 726 c = ' '; 727 lo = arridx; 728 hi = arridx + len - 1; 729 while (lo < hi) 730 { 731 m = (lo + hi) / 2; 732 if (byts[m] > c) 733 hi = m - 1; 734 else if (byts[m] < c) 735 lo = m + 1; 736 else 737 { 738 lo = hi = m; 739 break; 740 } 741 } 742 743 /* Stop if there is no matching byte. */ 744 if (hi < lo || byts[lo] != c) 745 break; 746 747 /* Continue at the child (if there is one). */ 748 arridx = idxs[lo]; 749 ++wlen; 750 --flen; 751 752 /* One space in the good word may stand for several spaces in the 753 * checked word. */ 754 if (c == ' ') 755 { 756 for (;;) 757 { 758 if (flen <= 0 && *mip->mi_fend != NUL) 759 flen = fold_more(mip); 760 if (ptr[wlen] != ' ' && ptr[wlen] != TAB) 761 break; 762 ++wlen; 763 --flen; 764 } 765 } 766 } 767 768 /* 769 * Verify that one of the possible endings is valid. Try the longest 770 * first. 771 */ 772 while (endidxcnt > 0) 773 { 774 --endidxcnt; 775 arridx = endidx[endidxcnt]; 776 wlen = endlen[endidxcnt]; 777 778 if ((*mb_head_off)(ptr, ptr + wlen) > 0) 779 continue; /* not at first byte of character */ 780 if (spell_iswordp(ptr + wlen, mip->mi_win)) 781 { 782 if (slang->sl_compprog == NULL && !slang->sl_nobreak) 783 continue; /* next char is a word character */ 784 word_ends = FALSE; 785 } 786 else 787 word_ends = TRUE; 788 /* The prefix flag is before compound flags. Once a valid prefix flag 789 * has been found we try compound flags. */ 790 prefix_found = FALSE; 791 792 if (mode != FIND_KEEPWORD && has_mbyte) 793 { 794 /* Compute byte length in original word, length may change 795 * when folding case. This can be slow, take a shortcut when the 796 * case-folded word is equal to the keep-case word. */ 797 p = mip->mi_word; 798 if (STRNCMP(ptr, p, wlen) != 0) 799 { 800 for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) 801 MB_PTR_ADV(p); 802 wlen = (int)(p - mip->mi_word); 803 } 804 } 805 806 /* Check flags and region. For FIND_PREFIX check the condition and 807 * prefix ID. 808 * Repeat this if there are more flags/region alternatives until there 809 * is a match. */ 810 res = SP_BAD; 811 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; 812 --len, ++arridx) 813 { 814 flags = idxs[arridx]; 815 816 /* For the fold-case tree check that the case of the checked word 817 * matches with what the word in the tree requires. 818 * For keep-case tree the case is always right. For prefixes we 819 * don't bother to check. */ 820 if (mode == FIND_FOLDWORD) 821 { 822 if (mip->mi_cend != mip->mi_word + wlen) 823 { 824 /* mi_capflags was set for a different word length, need 825 * to do it again. */ 826 mip->mi_cend = mip->mi_word + wlen; 827 mip->mi_capflags = captype(mip->mi_word, mip->mi_cend); 828 } 829 830 if (mip->mi_capflags == WF_KEEPCAP 831 || !spell_valid_case(mip->mi_capflags, flags)) 832 continue; 833 } 834 835 /* When mode is FIND_PREFIX the word must support the prefix: 836 * check the prefix ID and the condition. Do that for the list at 837 * mip->mi_prefarridx that find_prefix() filled. */ 838 else if (mode == FIND_PREFIX && !prefix_found) 839 { 840 c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx, 841 flags, 842 mip->mi_word + mip->mi_cprefixlen, slang, 843 FALSE); 844 if (c == 0) 845 continue; 846 847 /* Use the WF_RARE flag for a rare prefix. */ 848 if (c & WF_RAREPFX) 849 flags |= WF_RARE; 850 prefix_found = TRUE; 851 } 852 853 if (slang->sl_nobreak) 854 { 855 if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND) 856 && (flags & WF_BANNED) == 0) 857 { 858 /* NOBREAK: found a valid following word. That's all we 859 * need to know, so return. */ 860 mip->mi_result = SP_OK; 861 break; 862 } 863 } 864 865 else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND 866 || !word_ends)) 867 { 868 /* If there is no compound flag or the word is shorter than 869 * COMPOUNDMIN reject it quickly. 870 * Makes you wonder why someone puts a compound flag on a word 871 * that's too short... Myspell compatibility requires this 872 * anyway. */ 873 if (((unsigned)flags >> 24) == 0 874 || wlen - mip->mi_compoff < slang->sl_compminlen) 875 continue; 876 /* For multi-byte chars check character length against 877 * COMPOUNDMIN. */ 878 if (has_mbyte 879 && slang->sl_compminlen > 0 880 && mb_charlen_len(mip->mi_word + mip->mi_compoff, 881 wlen - mip->mi_compoff) < slang->sl_compminlen) 882 continue; 883 884 /* Limit the number of compound words to COMPOUNDWORDMAX if no 885 * maximum for syllables is specified. */ 886 if (!word_ends && mip->mi_complen + mip->mi_compextra + 2 887 > slang->sl_compmax 888 && slang->sl_compsylmax == MAXWLEN) 889 continue; 890 891 /* Don't allow compounding on a side where an affix was added, 892 * unless COMPOUNDPERMITFLAG was used. */ 893 if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF)) 894 continue; 895 if (!word_ends && (flags & WF_NOCOMPAFT)) 896 continue; 897 898 /* Quickly check if compounding is possible with this flag. */ 899 if (!byte_in_str(mip->mi_complen == 0 900 ? slang->sl_compstartflags 901 : slang->sl_compallflags, 902 ((unsigned)flags >> 24))) 903 continue; 904 905 /* If there is a match with a CHECKCOMPOUNDPATTERN rule 906 * discard the compound word. */ 907 if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat)) 908 continue; 909 910 if (mode == FIND_COMPOUND) 911 { 912 int capflags; 913 914 /* Need to check the caps type of the appended compound 915 * word. */ 916 if (has_mbyte && STRNCMP(ptr, mip->mi_word, 917 mip->mi_compoff) != 0) 918 { 919 /* case folding may have changed the length */ 920 p = mip->mi_word; 921 for (s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s)) 922 MB_PTR_ADV(p); 923 } 924 else 925 p = mip->mi_word + mip->mi_compoff; 926 capflags = captype(p, mip->mi_word + wlen); 927 if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP 928 && (flags & WF_FIXCAP) != 0)) 929 continue; 930 931 if (capflags != WF_ALLCAP) 932 { 933 /* When the character before the word is a word 934 * character we do not accept a Onecap word. We do 935 * accept a no-caps word, even when the dictionary 936 * word specifies ONECAP. */ 937 MB_PTR_BACK(mip->mi_word, p); 938 if (spell_iswordp_nmw(p, mip->mi_win) 939 ? capflags == WF_ONECAP 940 : (flags & WF_ONECAP) != 0 941 && capflags != WF_ONECAP) 942 continue; 943 } 944 } 945 946 /* If the word ends the sequence of compound flags of the 947 * words must match with one of the COMPOUNDRULE items and 948 * the number of syllables must not be too large. */ 949 mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24); 950 mip->mi_compflags[mip->mi_complen + 1] = NUL; 951 if (word_ends) 952 { 953 char_u fword[MAXWLEN]; 954 955 if (slang->sl_compsylmax < MAXWLEN) 956 { 957 /* "fword" is only needed for checking syllables. */ 958 if (ptr == mip->mi_word) 959 (void)spell_casefold(ptr, wlen, fword, MAXWLEN); 960 else 961 vim_strncpy(fword, ptr, endlen[endidxcnt]); 962 } 963 if (!can_compound(slang, fword, mip->mi_compflags)) 964 continue; 965 } 966 else if (slang->sl_comprules != NULL 967 && !match_compoundrule(slang, mip->mi_compflags)) 968 /* The compound flags collected so far do not match any 969 * COMPOUNDRULE, discard the compounded word. */ 970 continue; 971 } 972 973 /* Check NEEDCOMPOUND: can't use word without compounding. */ 974 else if (flags & WF_NEEDCOMP) 975 continue; 976 977 nobreak_result = SP_OK; 978 979 if (!word_ends) 980 { 981 int save_result = mip->mi_result; 982 char_u *save_end = mip->mi_end; 983 langp_T *save_lp = mip->mi_lp; 984 int lpi; 985 986 /* Check that a valid word follows. If there is one and we 987 * are compounding, it will set "mi_result", thus we are 988 * always finished here. For NOBREAK we only check that a 989 * valid word follows. 990 * Recursive! */ 991 if (slang->sl_nobreak) 992 mip->mi_result = SP_BAD; 993 994 /* Find following word in case-folded tree. */ 995 mip->mi_compoff = endlen[endidxcnt]; 996 if (has_mbyte && mode == FIND_KEEPWORD) 997 { 998 /* Compute byte length in case-folded word from "wlen": 999 * byte length in keep-case word. Length may change when 1000 * folding case. This can be slow, take a shortcut when 1001 * the case-folded word is equal to the keep-case word. */ 1002 p = mip->mi_fword; 1003 if (STRNCMP(ptr, p, wlen) != 0) 1004 { 1005 for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) 1006 MB_PTR_ADV(p); 1007 mip->mi_compoff = (int)(p - mip->mi_fword); 1008 } 1009 } 1010 #if 0 /* Disabled, see below */ 1011 c = mip->mi_compoff; 1012 #endif 1013 ++mip->mi_complen; 1014 if (flags & WF_COMPROOT) 1015 ++mip->mi_compextra; 1016 1017 /* For NOBREAK we need to try all NOBREAK languages, at least 1018 * to find the ".add" file(s). */ 1019 for (lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi) 1020 { 1021 if (slang->sl_nobreak) 1022 { 1023 mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi); 1024 if (mip->mi_lp->lp_slang->sl_fidxs == NULL 1025 || !mip->mi_lp->lp_slang->sl_nobreak) 1026 continue; 1027 } 1028 1029 find_word(mip, FIND_COMPOUND); 1030 1031 /* When NOBREAK any word that matches is OK. Otherwise we 1032 * need to find the longest match, thus try with keep-case 1033 * and prefix too. */ 1034 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) 1035 { 1036 /* Find following word in keep-case tree. */ 1037 mip->mi_compoff = wlen; 1038 find_word(mip, FIND_KEEPCOMPOUND); 1039 1040 #if 0 /* Disabled, a prefix must not appear halfway a compound word, 1041 unless the COMPOUNDPERMITFLAG is used and then it can't be a 1042 postponed prefix. */ 1043 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) 1044 { 1045 /* Check for following word with prefix. */ 1046 mip->mi_compoff = c; 1047 find_prefix(mip, FIND_COMPOUND); 1048 } 1049 #endif 1050 } 1051 1052 if (!slang->sl_nobreak) 1053 break; 1054 } 1055 --mip->mi_complen; 1056 if (flags & WF_COMPROOT) 1057 --mip->mi_compextra; 1058 mip->mi_lp = save_lp; 1059 1060 if (slang->sl_nobreak) 1061 { 1062 nobreak_result = mip->mi_result; 1063 mip->mi_result = save_result; 1064 mip->mi_end = save_end; 1065 } 1066 else 1067 { 1068 if (mip->mi_result == SP_OK) 1069 break; 1070 continue; 1071 } 1072 } 1073 1074 if (flags & WF_BANNED) 1075 res = SP_BANNED; 1076 else if (flags & WF_REGION) 1077 { 1078 /* Check region. */ 1079 if ((mip->mi_lp->lp_region & (flags >> 16)) != 0) 1080 res = SP_OK; 1081 else 1082 res = SP_LOCAL; 1083 } 1084 else if (flags & WF_RARE) 1085 res = SP_RARE; 1086 else 1087 res = SP_OK; 1088 1089 /* Always use the longest match and the best result. For NOBREAK 1090 * we separately keep the longest match without a following good 1091 * word as a fall-back. */ 1092 if (nobreak_result == SP_BAD) 1093 { 1094 if (mip->mi_result2 > res) 1095 { 1096 mip->mi_result2 = res; 1097 mip->mi_end2 = mip->mi_word + wlen; 1098 } 1099 else if (mip->mi_result2 == res 1100 && mip->mi_end2 < mip->mi_word + wlen) 1101 mip->mi_end2 = mip->mi_word + wlen; 1102 } 1103 else if (mip->mi_result > res) 1104 { 1105 mip->mi_result = res; 1106 mip->mi_end = mip->mi_word + wlen; 1107 } 1108 else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen) 1109 mip->mi_end = mip->mi_word + wlen; 1110 1111 if (mip->mi_result == SP_OK) 1112 break; 1113 } 1114 1115 if (mip->mi_result == SP_OK) 1116 break; 1117 } 1118 } 1119 1120 /* 1121 * Return TRUE if there is a match between the word ptr[wlen] and 1122 * CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another 1123 * word. 1124 * A match means that the first part of CHECKCOMPOUNDPATTERN matches at the 1125 * end of ptr[wlen] and the second part matches after it. 1126 */ 1127 static int 1128 match_checkcompoundpattern( 1129 char_u *ptr, 1130 int wlen, 1131 garray_T *gap) /* &sl_comppat */ 1132 { 1133 int i; 1134 char_u *p; 1135 int len; 1136 1137 for (i = 0; i + 1 < gap->ga_len; i += 2) 1138 { 1139 p = ((char_u **)gap->ga_data)[i + 1]; 1140 if (STRNCMP(ptr + wlen, p, STRLEN(p)) == 0) 1141 { 1142 /* Second part matches at start of following compound word, now 1143 * check if first part matches at end of previous word. */ 1144 p = ((char_u **)gap->ga_data)[i]; 1145 len = (int)STRLEN(p); 1146 if (len <= wlen && STRNCMP(ptr + wlen - len, p, len) == 0) 1147 return TRUE; 1148 } 1149 } 1150 return FALSE; 1151 } 1152 1153 /* 1154 * Return TRUE if "flags" is a valid sequence of compound flags and "word" 1155 * does not have too many syllables. 1156 */ 1157 static int 1158 can_compound(slang_T *slang, char_u *word, char_u *flags) 1159 { 1160 char_u uflags[MAXWLEN * 2]; 1161 int i; 1162 char_u *p; 1163 1164 if (slang->sl_compprog == NULL) 1165 return FALSE; 1166 if (enc_utf8) 1167 { 1168 /* Need to convert the single byte flags to utf8 characters. */ 1169 p = uflags; 1170 for (i = 0; flags[i] != NUL; ++i) 1171 p += utf_char2bytes(flags[i], p); 1172 *p = NUL; 1173 p = uflags; 1174 } 1175 else 1176 p = flags; 1177 if (!vim_regexec_prog(&slang->sl_compprog, FALSE, p, 0)) 1178 return FALSE; 1179 1180 /* Count the number of syllables. This may be slow, do it last. If there 1181 * are too many syllables AND the number of compound words is above 1182 * COMPOUNDWORDMAX then compounding is not allowed. */ 1183 if (slang->sl_compsylmax < MAXWLEN 1184 && count_syllables(slang, word) > slang->sl_compsylmax) 1185 return (int)STRLEN(flags) < slang->sl_compmax; 1186 return TRUE; 1187 } 1188 1189 /* 1190 * Return TRUE when the sequence of flags in "compflags" plus "flag" can 1191 * possibly form a valid compounded word. This also checks the COMPOUNDRULE 1192 * lines if they don't contain wildcards. 1193 */ 1194 static int 1195 can_be_compound( 1196 trystate_T *sp, 1197 slang_T *slang, 1198 char_u *compflags, 1199 int flag) 1200 { 1201 /* If the flag doesn't appear in sl_compstartflags or sl_compallflags 1202 * then it can't possibly compound. */ 1203 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit 1204 ? slang->sl_compstartflags : slang->sl_compallflags, flag)) 1205 return FALSE; 1206 1207 /* If there are no wildcards, we can check if the flags collected so far 1208 * possibly can form a match with COMPOUNDRULE patterns. This only 1209 * makes sense when we have two or more words. */ 1210 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) 1211 { 1212 int v; 1213 1214 compflags[sp->ts_complen] = flag; 1215 compflags[sp->ts_complen + 1] = NUL; 1216 v = match_compoundrule(slang, compflags + sp->ts_compsplit); 1217 compflags[sp->ts_complen] = NUL; 1218 return v; 1219 } 1220 1221 return TRUE; 1222 } 1223 1224 1225 /* 1226 * Return TRUE if the compound flags in compflags[] match the start of any 1227 * compound rule. This is used to stop trying a compound if the flags 1228 * collected so far can't possibly match any compound rule. 1229 * Caller must check that slang->sl_comprules is not NULL. 1230 */ 1231 static int 1232 match_compoundrule(slang_T *slang, char_u *compflags) 1233 { 1234 char_u *p; 1235 int i; 1236 int c; 1237 1238 /* loop over all the COMPOUNDRULE entries */ 1239 for (p = slang->sl_comprules; *p != NUL; ++p) 1240 { 1241 /* loop over the flags in the compound word we have made, match 1242 * them against the current rule entry */ 1243 for (i = 0; ; ++i) 1244 { 1245 c = compflags[i]; 1246 if (c == NUL) 1247 /* found a rule that matches for the flags we have so far */ 1248 return TRUE; 1249 if (*p == '/' || *p == NUL) 1250 break; /* end of rule, it's too short */ 1251 if (*p == '[') 1252 { 1253 int match = FALSE; 1254 1255 /* compare against all the flags in [] */ 1256 ++p; 1257 while (*p != ']' && *p != NUL) 1258 if (*p++ == c) 1259 match = TRUE; 1260 if (!match) 1261 break; /* none matches */ 1262 } 1263 else if (*p != c) 1264 break; /* flag of word doesn't match flag in pattern */ 1265 ++p; 1266 } 1267 1268 /* Skip to the next "/", where the next pattern starts. */ 1269 p = vim_strchr(p, '/'); 1270 if (p == NULL) 1271 break; 1272 } 1273 1274 /* Checked all the rules and none of them match the flags, so there 1275 * can't possibly be a compound starting with these flags. */ 1276 return FALSE; 1277 } 1278 1279 /* 1280 * Return non-zero if the prefix indicated by "arridx" matches with the prefix 1281 * ID in "flags" for the word "word". 1282 * The WF_RAREPFX flag is included in the return value for a rare prefix. 1283 */ 1284 static int 1285 valid_word_prefix( 1286 int totprefcnt, /* nr of prefix IDs */ 1287 int arridx, /* idx in sl_pidxs[] */ 1288 int flags, 1289 char_u *word, 1290 slang_T *slang, 1291 int cond_req) /* only use prefixes with a condition */ 1292 { 1293 int prefcnt; 1294 int pidx; 1295 regprog_T **rp; 1296 int prefid; 1297 1298 prefid = (unsigned)flags >> 24; 1299 for (prefcnt = totprefcnt - 1; prefcnt >= 0; --prefcnt) 1300 { 1301 pidx = slang->sl_pidxs[arridx + prefcnt]; 1302 1303 /* Check the prefix ID. */ 1304 if (prefid != (pidx & 0xff)) 1305 continue; 1306 1307 /* Check if the prefix doesn't combine and the word already has a 1308 * suffix. */ 1309 if ((flags & WF_HAS_AFF) && (pidx & WF_PFX_NC)) 1310 continue; 1311 1312 /* Check the condition, if there is one. The condition index is 1313 * stored in the two bytes above the prefix ID byte. */ 1314 rp = &slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff]; 1315 if (*rp != NULL) 1316 { 1317 if (!vim_regexec_prog(rp, FALSE, word, 0)) 1318 continue; 1319 } 1320 else if (cond_req) 1321 continue; 1322 1323 /* It's a match! Return the WF_ flags. */ 1324 return pidx; 1325 } 1326 return 0; 1327 } 1328 1329 /* 1330 * Check if the word at "mip->mi_word" has a matching prefix. 1331 * If it does, then check the following word. 1332 * 1333 * If "mode" is "FIND_COMPOUND" then do the same after another word, find a 1334 * prefix in a compound word. 1335 * 1336 * For a match mip->mi_result is updated. 1337 */ 1338 static void 1339 find_prefix(matchinf_T *mip, int mode) 1340 { 1341 idx_T arridx = 0; 1342 int len; 1343 int wlen = 0; 1344 int flen; 1345 int c; 1346 char_u *ptr; 1347 idx_T lo, hi, m; 1348 slang_T *slang = mip->mi_lp->lp_slang; 1349 char_u *byts; 1350 idx_T *idxs; 1351 1352 byts = slang->sl_pbyts; 1353 if (byts == NULL) 1354 return; /* array is empty */ 1355 1356 /* We use the case-folded word here, since prefixes are always 1357 * case-folded. */ 1358 ptr = mip->mi_fword; 1359 flen = mip->mi_fwordlen; /* available case-folded bytes */ 1360 if (mode == FIND_COMPOUND) 1361 { 1362 /* Skip over the previously found word(s). */ 1363 ptr += mip->mi_compoff; 1364 flen -= mip->mi_compoff; 1365 } 1366 idxs = slang->sl_pidxs; 1367 1368 /* 1369 * Repeat advancing in the tree until: 1370 * - there is a byte that doesn't match, 1371 * - we reach the end of the tree, 1372 * - or we reach the end of the line. 1373 */ 1374 for (;;) 1375 { 1376 if (flen == 0 && *mip->mi_fend != NUL) 1377 flen = fold_more(mip); 1378 1379 len = byts[arridx++]; 1380 1381 /* If the first possible byte is a zero the prefix could end here. 1382 * Check if the following word matches and supports the prefix. */ 1383 if (byts[arridx] == 0) 1384 { 1385 /* There can be several prefixes with different conditions. We 1386 * try them all, since we don't know which one will give the 1387 * longest match. The word is the same each time, pass the list 1388 * of possible prefixes to find_word(). */ 1389 mip->mi_prefarridx = arridx; 1390 mip->mi_prefcnt = len; 1391 while (len > 0 && byts[arridx] == 0) 1392 { 1393 ++arridx; 1394 --len; 1395 } 1396 mip->mi_prefcnt -= len; 1397 1398 /* Find the word that comes after the prefix. */ 1399 mip->mi_prefixlen = wlen; 1400 if (mode == FIND_COMPOUND) 1401 /* Skip over the previously found word(s). */ 1402 mip->mi_prefixlen += mip->mi_compoff; 1403 1404 if (has_mbyte) 1405 { 1406 /* Case-folded length may differ from original length. */ 1407 mip->mi_cprefixlen = nofold_len(mip->mi_fword, 1408 mip->mi_prefixlen, mip->mi_word); 1409 } 1410 else 1411 mip->mi_cprefixlen = mip->mi_prefixlen; 1412 find_word(mip, FIND_PREFIX); 1413 1414 1415 if (len == 0) 1416 break; /* no children, word must end here */ 1417 } 1418 1419 /* Stop looking at end of the line. */ 1420 if (ptr[wlen] == NUL) 1421 break; 1422 1423 /* Perform a binary search in the list of accepted bytes. */ 1424 c = ptr[wlen]; 1425 lo = arridx; 1426 hi = arridx + len - 1; 1427 while (lo < hi) 1428 { 1429 m = (lo + hi) / 2; 1430 if (byts[m] > c) 1431 hi = m - 1; 1432 else if (byts[m] < c) 1433 lo = m + 1; 1434 else 1435 { 1436 lo = hi = m; 1437 break; 1438 } 1439 } 1440 1441 /* Stop if there is no matching byte. */ 1442 if (hi < lo || byts[lo] != c) 1443 break; 1444 1445 /* Continue at the child (if there is one). */ 1446 arridx = idxs[lo]; 1447 ++wlen; 1448 --flen; 1449 } 1450 } 1451 1452 /* 1453 * Need to fold at least one more character. Do until next non-word character 1454 * for efficiency. Include the non-word character too. 1455 * Return the length of the folded chars in bytes. 1456 */ 1457 static int 1458 fold_more(matchinf_T *mip) 1459 { 1460 int flen; 1461 char_u *p; 1462 1463 p = mip->mi_fend; 1464 do 1465 MB_PTR_ADV(mip->mi_fend); 1466 while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win)); 1467 1468 /* Include the non-word character so that we can check for the word end. */ 1469 if (*mip->mi_fend != NUL) 1470 MB_PTR_ADV(mip->mi_fend); 1471 1472 (void)spell_casefold(p, (int)(mip->mi_fend - p), 1473 mip->mi_fword + mip->mi_fwordlen, 1474 MAXWLEN - mip->mi_fwordlen); 1475 flen = (int)STRLEN(mip->mi_fword + mip->mi_fwordlen); 1476 mip->mi_fwordlen += flen; 1477 return flen; 1478 } 1479 1480 /* 1481 * Check case flags for a word. Return TRUE if the word has the requested 1482 * case. 1483 */ 1484 static int 1485 spell_valid_case( 1486 int wordflags, /* flags for the checked word. */ 1487 int treeflags) /* flags for the word in the spell tree */ 1488 { 1489 return ((wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0) 1490 || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0 1491 && ((treeflags & WF_ONECAP) == 0 1492 || (wordflags & WF_ONECAP) != 0))); 1493 } 1494 1495 /* 1496 * Return TRUE if spell checking is not enabled. 1497 */ 1498 static int 1499 no_spell_checking(win_T *wp) 1500 { 1501 if (!wp->w_p_spell || *wp->w_s->b_p_spl == NUL 1502 || wp->w_s->b_langp.ga_len == 0) 1503 { 1504 emsg(_("E756: Spell checking is not enabled")); 1505 return TRUE; 1506 } 1507 return FALSE; 1508 } 1509 1510 /* 1511 * Move to next spell error. 1512 * "curline" is FALSE for "[s", "]s", "[S" and "]S". 1513 * "curline" is TRUE to find word under/after cursor in the same line. 1514 * For Insert mode completion "dir" is BACKWARD and "curline" is TRUE: move 1515 * to after badly spelled word before the cursor. 1516 * Return 0 if not found, length of the badly spelled word otherwise. 1517 */ 1518 int 1519 spell_move_to( 1520 win_T *wp, 1521 int dir, /* FORWARD or BACKWARD */ 1522 int allwords, /* TRUE for "[s"/"]s", FALSE for "[S"/"]S" */ 1523 int curline, 1524 hlf_T *attrp) /* return: attributes of bad word or NULL 1525 (only when "dir" is FORWARD) */ 1526 { 1527 linenr_T lnum; 1528 pos_T found_pos; 1529 int found_len = 0; 1530 char_u *line; 1531 char_u *p; 1532 char_u *endp; 1533 hlf_T attr; 1534 int len; 1535 #ifdef FEAT_SYN_HL 1536 int has_syntax = syntax_present(wp); 1537 #endif 1538 int col; 1539 int can_spell; 1540 char_u *buf = NULL; 1541 int buflen = 0; 1542 int skip = 0; 1543 int capcol = -1; 1544 int found_one = FALSE; 1545 int wrapped = FALSE; 1546 1547 if (no_spell_checking(wp)) 1548 return 0; 1549 1550 /* 1551 * Start looking for bad word at the start of the line, because we can't 1552 * start halfway a word, we don't know where it starts or ends. 1553 * 1554 * When searching backwards, we continue in the line to find the last 1555 * bad word (in the cursor line: before the cursor). 1556 * 1557 * We concatenate the start of the next line, so that wrapped words work 1558 * (e.g. "et<line-break>cetera"). Doesn't work when searching backwards 1559 * though... 1560 */ 1561 lnum = wp->w_cursor.lnum; 1562 CLEAR_POS(&found_pos); 1563 1564 while (!got_int) 1565 { 1566 line = ml_get_buf(wp->w_buffer, lnum, FALSE); 1567 1568 len = (int)STRLEN(line); 1569 if (buflen < len + MAXWLEN + 2) 1570 { 1571 vim_free(buf); 1572 buflen = len + MAXWLEN + 2; 1573 buf = alloc(buflen); 1574 if (buf == NULL) 1575 break; 1576 } 1577 1578 /* In first line check first word for Capital. */ 1579 if (lnum == 1) 1580 capcol = 0; 1581 1582 /* For checking first word with a capital skip white space. */ 1583 if (capcol == 0) 1584 capcol = getwhitecols(line); 1585 else if (curline && wp == curwin) 1586 { 1587 /* For spellbadword(): check if first word needs a capital. */ 1588 col = getwhitecols(line); 1589 if (check_need_cap(lnum, col)) 1590 capcol = col; 1591 1592 /* Need to get the line again, may have looked at the previous 1593 * one. */ 1594 line = ml_get_buf(wp->w_buffer, lnum, FALSE); 1595 } 1596 1597 /* Copy the line into "buf" and append the start of the next line if 1598 * possible. */ 1599 STRCPY(buf, line); 1600 if (lnum < wp->w_buffer->b_ml.ml_line_count) 1601 spell_cat_line(buf + STRLEN(buf), 1602 ml_get_buf(wp->w_buffer, lnum + 1, FALSE), MAXWLEN); 1603 1604 p = buf + skip; 1605 endp = buf + len; 1606 while (p < endp) 1607 { 1608 /* When searching backward don't search after the cursor. Unless 1609 * we wrapped around the end of the buffer. */ 1610 if (dir == BACKWARD 1611 && lnum == wp->w_cursor.lnum 1612 && !wrapped 1613 && (colnr_T)(p - buf) >= wp->w_cursor.col) 1614 break; 1615 1616 /* start of word */ 1617 attr = HLF_COUNT; 1618 len = spell_check(wp, p, &attr, &capcol, FALSE); 1619 1620 if (attr != HLF_COUNT) 1621 { 1622 /* We found a bad word. Check the attribute. */ 1623 if (allwords || attr == HLF_SPB) 1624 { 1625 /* When searching forward only accept a bad word after 1626 * the cursor. */ 1627 if (dir == BACKWARD 1628 || lnum != wp->w_cursor.lnum 1629 || (lnum == wp->w_cursor.lnum 1630 && (wrapped 1631 || (colnr_T)(curline ? p - buf + len 1632 : p - buf) 1633 > wp->w_cursor.col))) 1634 { 1635 #ifdef FEAT_SYN_HL 1636 if (has_syntax) 1637 { 1638 col = (int)(p - buf); 1639 (void)syn_get_id(wp, lnum, (colnr_T)col, 1640 FALSE, &can_spell, FALSE); 1641 if (!can_spell) 1642 attr = HLF_COUNT; 1643 } 1644 else 1645 #endif 1646 can_spell = TRUE; 1647 1648 if (can_spell) 1649 { 1650 found_one = TRUE; 1651 found_pos.lnum = lnum; 1652 found_pos.col = (int)(p - buf); 1653 found_pos.coladd = 0; 1654 if (dir == FORWARD) 1655 { 1656 /* No need to search further. */ 1657 wp->w_cursor = found_pos; 1658 vim_free(buf); 1659 if (attrp != NULL) 1660 *attrp = attr; 1661 return len; 1662 } 1663 else if (curline) 1664 /* Insert mode completion: put cursor after 1665 * the bad word. */ 1666 found_pos.col += len; 1667 found_len = len; 1668 } 1669 } 1670 else 1671 found_one = TRUE; 1672 } 1673 } 1674 1675 /* advance to character after the word */ 1676 p += len; 1677 capcol -= len; 1678 } 1679 1680 if (dir == BACKWARD && found_pos.lnum != 0) 1681 { 1682 /* Use the last match in the line (before the cursor). */ 1683 wp->w_cursor = found_pos; 1684 vim_free(buf); 1685 return found_len; 1686 } 1687 1688 if (curline) 1689 break; /* only check cursor line */ 1690 1691 /* If we are back at the starting line and searched it again there 1692 * is no match, give up. */ 1693 if (lnum == wp->w_cursor.lnum && wrapped) 1694 break; 1695 1696 /* Advance to next line. */ 1697 if (dir == BACKWARD) 1698 { 1699 if (lnum > 1) 1700 --lnum; 1701 else if (!p_ws) 1702 break; /* at first line and 'nowrapscan' */ 1703 else 1704 { 1705 /* Wrap around to the end of the buffer. May search the 1706 * starting line again and accept the last match. */ 1707 lnum = wp->w_buffer->b_ml.ml_line_count; 1708 wrapped = TRUE; 1709 if (!shortmess(SHM_SEARCH)) 1710 give_warning((char_u *)_(top_bot_msg), TRUE); 1711 } 1712 capcol = -1; 1713 } 1714 else 1715 { 1716 if (lnum < wp->w_buffer->b_ml.ml_line_count) 1717 ++lnum; 1718 else if (!p_ws) 1719 break; /* at first line and 'nowrapscan' */ 1720 else 1721 { 1722 /* Wrap around to the start of the buffer. May search the 1723 * starting line again and accept the first match. */ 1724 lnum = 1; 1725 wrapped = TRUE; 1726 if (!shortmess(SHM_SEARCH)) 1727 give_warning((char_u *)_(bot_top_msg), TRUE); 1728 } 1729 1730 /* If we are back at the starting line and there is no match then 1731 * give up. */ 1732 if (lnum == wp->w_cursor.lnum && !found_one) 1733 break; 1734 1735 /* Skip the characters at the start of the next line that were 1736 * included in a match crossing line boundaries. */ 1737 if (attr == HLF_COUNT) 1738 skip = (int)(p - endp); 1739 else 1740 skip = 0; 1741 1742 /* Capcol skips over the inserted space. */ 1743 --capcol; 1744 1745 /* But after empty line check first word in next line */ 1746 if (*skipwhite(line) == NUL) 1747 capcol = 0; 1748 } 1749 1750 line_breakcheck(); 1751 } 1752 1753 vim_free(buf); 1754 return 0; 1755 } 1756 1757 /* 1758 * For spell checking: concatenate the start of the following line "line" into 1759 * "buf", blanking-out special characters. Copy less then "maxlen" bytes. 1760 * Keep the blanks at the start of the next line, this is used in win_line() 1761 * to skip those bytes if the word was OK. 1762 */ 1763 void 1764 spell_cat_line(char_u *buf, char_u *line, int maxlen) 1765 { 1766 char_u *p; 1767 int n; 1768 1769 p = skipwhite(line); 1770 while (vim_strchr((char_u *)"*#/\"\t", *p) != NULL) 1771 p = skipwhite(p + 1); 1772 1773 if (*p != NUL) 1774 { 1775 /* Only worth concatenating if there is something else than spaces to 1776 * concatenate. */ 1777 n = (int)(p - line) + 1; 1778 if (n < maxlen - 1) 1779 { 1780 vim_memset(buf, ' ', n); 1781 vim_strncpy(buf + n, p, maxlen - 1 - n); 1782 } 1783 } 1784 } 1785 1786 /* 1787 * Structure used for the cookie argument of do_in_runtimepath(). 1788 */ 1789 typedef struct spelload_S 1790 { 1791 char_u sl_lang[MAXWLEN + 1]; /* language name */ 1792 slang_T *sl_slang; /* resulting slang_T struct */ 1793 int sl_nobreak; /* NOBREAK language found */ 1794 } spelload_T; 1795 1796 /* 1797 * Load word list(s) for "lang" from Vim spell file(s). 1798 * "lang" must be the language without the region: e.g., "en". 1799 */ 1800 static void 1801 spell_load_lang(char_u *lang) 1802 { 1803 char_u fname_enc[85]; 1804 int r; 1805 spelload_T sl; 1806 int round; 1807 1808 /* Copy the language name to pass it to spell_load_cb() as a cookie. 1809 * It's truncated when an error is detected. */ 1810 STRCPY(sl.sl_lang, lang); 1811 sl.sl_slang = NULL; 1812 sl.sl_nobreak = FALSE; 1813 1814 /* We may retry when no spell file is found for the language, an 1815 * autocommand may load it then. */ 1816 for (round = 1; round <= 2; ++round) 1817 { 1818 /* 1819 * Find the first spell file for "lang" in 'runtimepath' and load it. 1820 */ 1821 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, 1822 #ifdef VMS 1823 "spell/%s_%s.spl", 1824 #else 1825 "spell/%s.%s.spl", 1826 #endif 1827 lang, spell_enc()); 1828 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); 1829 1830 if (r == FAIL && *sl.sl_lang != NUL) 1831 { 1832 /* Try loading the ASCII version. */ 1833 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, 1834 #ifdef VMS 1835 "spell/%s_ascii.spl", 1836 #else 1837 "spell/%s.ascii.spl", 1838 #endif 1839 lang); 1840 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); 1841 1842 if (r == FAIL && *sl.sl_lang != NUL && round == 1 1843 && apply_autocmds(EVENT_SPELLFILEMISSING, lang, 1844 curbuf->b_fname, FALSE, curbuf)) 1845 continue; 1846 break; 1847 } 1848 break; 1849 } 1850 1851 if (r == FAIL) 1852 { 1853 smsg( 1854 #ifdef VMS 1855 _("Warning: Cannot find word list \"%s_%s.spl\" or \"%s_ascii.spl\""), 1856 #else 1857 _("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\""), 1858 #endif 1859 lang, spell_enc(), lang); 1860 } 1861 else if (sl.sl_slang != NULL) 1862 { 1863 /* At least one file was loaded, now load ALL the additions. */ 1864 STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl"); 1865 do_in_runtimepath(fname_enc, DIP_ALL, spell_load_cb, &sl); 1866 } 1867 } 1868 1869 /* 1870 * Return the encoding used for spell checking: Use 'encoding', except that we 1871 * use "latin1" for "latin9". And limit to 60 characters (just in case). 1872 */ 1873 char_u * 1874 spell_enc(void) 1875 { 1876 1877 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0) 1878 return p_enc; 1879 return (char_u *)"latin1"; 1880 } 1881 1882 /* 1883 * Get the name of the .spl file for the internal wordlist into 1884 * "fname[MAXPATHL]". 1885 */ 1886 static void 1887 int_wordlist_spl(char_u *fname) 1888 { 1889 vim_snprintf((char *)fname, MAXPATHL, SPL_FNAME_TMPL, 1890 int_wordlist, spell_enc()); 1891 } 1892 1893 /* 1894 * Allocate a new slang_T for language "lang". "lang" can be NULL. 1895 * Caller must fill "sl_next". 1896 */ 1897 slang_T * 1898 slang_alloc(char_u *lang) 1899 { 1900 slang_T *lp; 1901 1902 lp = (slang_T *)alloc_clear(sizeof(slang_T)); 1903 if (lp != NULL) 1904 { 1905 if (lang != NULL) 1906 lp->sl_name = vim_strsave(lang); 1907 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); 1908 ga_init2(&lp->sl_repsal, sizeof(fromto_T), 10); 1909 lp->sl_compmax = MAXWLEN; 1910 lp->sl_compsylmax = MAXWLEN; 1911 hash_init(&lp->sl_wordcount); 1912 } 1913 1914 return lp; 1915 } 1916 1917 /* 1918 * Free the contents of an slang_T and the structure itself. 1919 */ 1920 void 1921 slang_free(slang_T *lp) 1922 { 1923 vim_free(lp->sl_name); 1924 vim_free(lp->sl_fname); 1925 slang_clear(lp); 1926 vim_free(lp); 1927 } 1928 1929 /* 1930 * Clear an slang_T so that the file can be reloaded. 1931 */ 1932 void 1933 slang_clear(slang_T *lp) 1934 { 1935 garray_T *gap; 1936 fromto_T *ftp; 1937 salitem_T *smp; 1938 int i; 1939 int round; 1940 1941 VIM_CLEAR(lp->sl_fbyts); 1942 VIM_CLEAR(lp->sl_kbyts); 1943 VIM_CLEAR(lp->sl_pbyts); 1944 1945 VIM_CLEAR(lp->sl_fidxs); 1946 VIM_CLEAR(lp->sl_kidxs); 1947 VIM_CLEAR(lp->sl_pidxs); 1948 1949 for (round = 1; round <= 2; ++round) 1950 { 1951 gap = round == 1 ? &lp->sl_rep : &lp->sl_repsal; 1952 while (gap->ga_len > 0) 1953 { 1954 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; 1955 vim_free(ftp->ft_from); 1956 vim_free(ftp->ft_to); 1957 } 1958 ga_clear(gap); 1959 } 1960 1961 gap = &lp->sl_sal; 1962 if (lp->sl_sofo) 1963 { 1964 /* "ga_len" is set to 1 without adding an item for latin1 */ 1965 if (gap->ga_data != NULL) 1966 /* SOFOFROM and SOFOTO items: free lists of wide characters. */ 1967 for (i = 0; i < gap->ga_len; ++i) 1968 vim_free(((int **)gap->ga_data)[i]); 1969 } 1970 else 1971 /* SAL items: free salitem_T items */ 1972 while (gap->ga_len > 0) 1973 { 1974 smp = &((salitem_T *)gap->ga_data)[--gap->ga_len]; 1975 vim_free(smp->sm_lead); 1976 /* Don't free sm_oneof and sm_rules, they point into sm_lead. */ 1977 vim_free(smp->sm_to); 1978 vim_free(smp->sm_lead_w); 1979 vim_free(smp->sm_oneof_w); 1980 vim_free(smp->sm_to_w); 1981 } 1982 ga_clear(gap); 1983 1984 for (i = 0; i < lp->sl_prefixcnt; ++i) 1985 vim_regfree(lp->sl_prefprog[i]); 1986 lp->sl_prefixcnt = 0; 1987 VIM_CLEAR(lp->sl_prefprog); 1988 1989 VIM_CLEAR(lp->sl_info); 1990 1991 VIM_CLEAR(lp->sl_midword); 1992 1993 vim_regfree(lp->sl_compprog); 1994 lp->sl_compprog = NULL; 1995 VIM_CLEAR(lp->sl_comprules); 1996 VIM_CLEAR(lp->sl_compstartflags); 1997 VIM_CLEAR(lp->sl_compallflags); 1998 1999 VIM_CLEAR(lp->sl_syllable); 2000 ga_clear(&lp->sl_syl_items); 2001 2002 ga_clear_strings(&lp->sl_comppat); 2003 2004 hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF); 2005 hash_init(&lp->sl_wordcount); 2006 2007 hash_clear_all(&lp->sl_map_hash, 0); 2008 2009 /* Clear info from .sug file. */ 2010 slang_clear_sug(lp); 2011 2012 lp->sl_compmax = MAXWLEN; 2013 lp->sl_compminlen = 0; 2014 lp->sl_compsylmax = MAXWLEN; 2015 lp->sl_regions[0] = NUL; 2016 } 2017 2018 /* 2019 * Clear the info from the .sug file in "lp". 2020 */ 2021 void 2022 slang_clear_sug(slang_T *lp) 2023 { 2024 VIM_CLEAR(lp->sl_sbyts); 2025 VIM_CLEAR(lp->sl_sidxs); 2026 close_spellbuf(lp->sl_sugbuf); 2027 lp->sl_sugbuf = NULL; 2028 lp->sl_sugloaded = FALSE; 2029 lp->sl_sugtime = 0; 2030 } 2031 2032 /* 2033 * Load one spell file and store the info into a slang_T. 2034 * Invoked through do_in_runtimepath(). 2035 */ 2036 static void 2037 spell_load_cb(char_u *fname, void *cookie) 2038 { 2039 spelload_T *slp = (spelload_T *)cookie; 2040 slang_T *slang; 2041 2042 slang = spell_load_file(fname, slp->sl_lang, NULL, FALSE); 2043 if (slang != NULL) 2044 { 2045 /* When a previously loaded file has NOBREAK also use it for the 2046 * ".add" files. */ 2047 if (slp->sl_nobreak && slang->sl_add) 2048 slang->sl_nobreak = TRUE; 2049 else if (slang->sl_nobreak) 2050 slp->sl_nobreak = TRUE; 2051 2052 slp->sl_slang = slang; 2053 } 2054 } 2055 2056 2057 /* 2058 * Add a word to the hashtable of common words. 2059 * If it's already there then the counter is increased. 2060 */ 2061 void 2062 count_common_word( 2063 slang_T *lp, 2064 char_u *word, 2065 int len, /* word length, -1 for upto NUL */ 2066 int count) /* 1 to count once, 10 to init */ 2067 { 2068 hash_T hash; 2069 hashitem_T *hi; 2070 wordcount_T *wc; 2071 char_u buf[MAXWLEN]; 2072 char_u *p; 2073 2074 if (len == -1) 2075 p = word; 2076 else 2077 { 2078 vim_strncpy(buf, word, len); 2079 p = buf; 2080 } 2081 2082 hash = hash_hash(p); 2083 hi = hash_lookup(&lp->sl_wordcount, p, hash); 2084 if (HASHITEM_EMPTY(hi)) 2085 { 2086 wc = (wordcount_T *)alloc((unsigned)(sizeof(wordcount_T) + STRLEN(p))); 2087 if (wc == NULL) 2088 return; 2089 STRCPY(wc->wc_word, p); 2090 wc->wc_count = count; 2091 hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash); 2092 } 2093 else 2094 { 2095 wc = HI2WC(hi); 2096 if ((wc->wc_count += count) < (unsigned)count) /* check for overflow */ 2097 wc->wc_count = MAXWORDCOUNT; 2098 } 2099 } 2100 2101 /* 2102 * Adjust the score of common words. 2103 */ 2104 static int 2105 score_wordcount_adj( 2106 slang_T *slang, 2107 int score, 2108 char_u *word, 2109 int split) /* word was split, less bonus */ 2110 { 2111 hashitem_T *hi; 2112 wordcount_T *wc; 2113 int bonus; 2114 int newscore; 2115 2116 hi = hash_find(&slang->sl_wordcount, word); 2117 if (!HASHITEM_EMPTY(hi)) 2118 { 2119 wc = HI2WC(hi); 2120 if (wc->wc_count < SCORE_THRES2) 2121 bonus = SCORE_COMMON1; 2122 else if (wc->wc_count < SCORE_THRES3) 2123 bonus = SCORE_COMMON2; 2124 else 2125 bonus = SCORE_COMMON3; 2126 if (split) 2127 newscore = score - bonus / 2; 2128 else 2129 newscore = score - bonus; 2130 if (newscore < 0) 2131 return 0; 2132 return newscore; 2133 } 2134 return score; 2135 } 2136 2137 2138 /* 2139 * Return TRUE if byte "n" appears in "str". 2140 * Like strchr() but independent of locale. 2141 */ 2142 int 2143 byte_in_str(char_u *str, int n) 2144 { 2145 char_u *p; 2146 2147 for (p = str; *p != NUL; ++p) 2148 if (*p == n) 2149 return TRUE; 2150 return FALSE; 2151 } 2152 2153 #define SY_MAXLEN 30 2154 typedef struct syl_item_S 2155 { 2156 char_u sy_chars[SY_MAXLEN]; /* the sequence of chars */ 2157 int sy_len; 2158 } syl_item_T; 2159 2160 /* 2161 * Truncate "slang->sl_syllable" at the first slash and put the following items 2162 * in "slang->sl_syl_items". 2163 */ 2164 int 2165 init_syl_tab(slang_T *slang) 2166 { 2167 char_u *p; 2168 char_u *s; 2169 int l; 2170 syl_item_T *syl; 2171 2172 ga_init2(&slang->sl_syl_items, sizeof(syl_item_T), 4); 2173 p = vim_strchr(slang->sl_syllable, '/'); 2174 while (p != NULL) 2175 { 2176 *p++ = NUL; 2177 if (*p == NUL) /* trailing slash */ 2178 break; 2179 s = p; 2180 p = vim_strchr(p, '/'); 2181 if (p == NULL) 2182 l = (int)STRLEN(s); 2183 else 2184 l = (int)(p - s); 2185 if (l >= SY_MAXLEN) 2186 return SP_FORMERROR; 2187 if (ga_grow(&slang->sl_syl_items, 1) == FAIL) 2188 return SP_OTHERERROR; 2189 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) 2190 + slang->sl_syl_items.ga_len++; 2191 vim_strncpy(syl->sy_chars, s, l); 2192 syl->sy_len = l; 2193 } 2194 return OK; 2195 } 2196 2197 /* 2198 * Count the number of syllables in "word". 2199 * When "word" contains spaces the syllables after the last space are counted. 2200 * Returns zero if syllables are not defines. 2201 */ 2202 static int 2203 count_syllables(slang_T *slang, char_u *word) 2204 { 2205 int cnt = 0; 2206 int skip = FALSE; 2207 char_u *p; 2208 int len; 2209 int i; 2210 syl_item_T *syl; 2211 int c; 2212 2213 if (slang->sl_syllable == NULL) 2214 return 0; 2215 2216 for (p = word; *p != NUL; p += len) 2217 { 2218 /* When running into a space reset counter. */ 2219 if (*p == ' ') 2220 { 2221 len = 1; 2222 cnt = 0; 2223 continue; 2224 } 2225 2226 /* Find longest match of syllable items. */ 2227 len = 0; 2228 for (i = 0; i < slang->sl_syl_items.ga_len; ++i) 2229 { 2230 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) + i; 2231 if (syl->sy_len > len 2232 && STRNCMP(p, syl->sy_chars, syl->sy_len) == 0) 2233 len = syl->sy_len; 2234 } 2235 if (len != 0) /* found a match, count syllable */ 2236 { 2237 ++cnt; 2238 skip = FALSE; 2239 } 2240 else 2241 { 2242 /* No recognized syllable item, at least a syllable char then? */ 2243 c = mb_ptr2char(p); 2244 len = (*mb_ptr2len)(p); 2245 if (vim_strchr(slang->sl_syllable, c) == NULL) 2246 skip = FALSE; /* No, search for next syllable */ 2247 else if (!skip) 2248 { 2249 ++cnt; /* Yes, count it */ 2250 skip = TRUE; /* don't count following syllable chars */ 2251 } 2252 } 2253 } 2254 return cnt; 2255 } 2256 2257 /* 2258 * Parse 'spelllang' and set w_s->b_langp accordingly. 2259 * Returns NULL if it's OK, an error message otherwise. 2260 */ 2261 char * 2262 did_set_spelllang(win_T *wp) 2263 { 2264 garray_T ga; 2265 char_u *splp; 2266 char_u *region; 2267 char_u region_cp[3]; 2268 int filename; 2269 int region_mask; 2270 slang_T *slang; 2271 int c; 2272 char_u lang[MAXWLEN + 1]; 2273 char_u spf_name[MAXPATHL]; 2274 int len; 2275 char_u *p; 2276 int round; 2277 char_u *spf; 2278 char_u *use_region = NULL; 2279 int dont_use_region = FALSE; 2280 int nobreak = FALSE; 2281 int i, j; 2282 langp_T *lp, *lp2; 2283 static int recursive = FALSE; 2284 char *ret_msg = NULL; 2285 char_u *spl_copy; 2286 bufref_T bufref; 2287 2288 set_bufref(&bufref, wp->w_buffer); 2289 2290 /* We don't want to do this recursively. May happen when a language is 2291 * not available and the SpellFileMissing autocommand opens a new buffer 2292 * in which 'spell' is set. */ 2293 if (recursive) 2294 return NULL; 2295 recursive = TRUE; 2296 2297 ga_init2(&ga, sizeof(langp_T), 2); 2298 clear_midword(wp); 2299 2300 /* Make a copy of 'spelllang', the SpellFileMissing autocommands may change 2301 * it under our fingers. */ 2302 spl_copy = vim_strsave(wp->w_s->b_p_spl); 2303 if (spl_copy == NULL) 2304 goto theend; 2305 2306 wp->w_s->b_cjk = 0; 2307 2308 /* Loop over comma separated language names. */ 2309 for (splp = spl_copy; *splp != NUL; ) 2310 { 2311 // Get one language name. 2312 copy_option_part(&splp, lang, MAXWLEN, ","); 2313 region = NULL; 2314 len = (int)STRLEN(lang); 2315 2316 if (!valid_spellang(lang)) 2317 continue; 2318 2319 if (STRCMP(lang, "cjk") == 0) 2320 { 2321 wp->w_s->b_cjk = 1; 2322 continue; 2323 } 2324 2325 /* If the name ends in ".spl" use it as the name of the spell file. 2326 * If there is a region name let "region" point to it and remove it 2327 * from the name. */ 2328 if (len > 4 && fnamecmp(lang + len - 4, ".spl") == 0) 2329 { 2330 filename = TRUE; 2331 2332 /* Locate a region and remove it from the file name. */ 2333 p = vim_strchr(gettail(lang), '_'); 2334 if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2]) 2335 && !ASCII_ISALPHA(p[3])) 2336 { 2337 vim_strncpy(region_cp, p + 1, 2); 2338 mch_memmove(p, p + 3, len - (p - lang) - 2); 2339 region = region_cp; 2340 } 2341 else 2342 dont_use_region = TRUE; 2343 2344 /* Check if we loaded this language before. */ 2345 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2346 if (fullpathcmp(lang, slang->sl_fname, FALSE) == FPC_SAME) 2347 break; 2348 } 2349 else 2350 { 2351 filename = FALSE; 2352 if (len > 3 && lang[len - 3] == '_') 2353 { 2354 region = lang + len - 2; 2355 len -= 3; 2356 lang[len] = NUL; 2357 } 2358 else 2359 dont_use_region = TRUE; 2360 2361 /* Check if we loaded this language before. */ 2362 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2363 if (STRICMP(lang, slang->sl_name) == 0) 2364 break; 2365 } 2366 2367 if (region != NULL) 2368 { 2369 /* If the region differs from what was used before then don't 2370 * use it for 'spellfile'. */ 2371 if (use_region != NULL && STRCMP(region, use_region) != 0) 2372 dont_use_region = TRUE; 2373 use_region = region; 2374 } 2375 2376 /* If not found try loading the language now. */ 2377 if (slang == NULL) 2378 { 2379 if (filename) 2380 (void)spell_load_file(lang, lang, NULL, FALSE); 2381 else 2382 { 2383 spell_load_lang(lang); 2384 /* SpellFileMissing autocommands may do anything, including 2385 * destroying the buffer we are using... */ 2386 if (!bufref_valid(&bufref)) 2387 { 2388 ret_msg = N_("E797: SpellFileMissing autocommand deleted buffer"); 2389 goto theend; 2390 } 2391 } 2392 } 2393 2394 /* 2395 * Loop over the languages, there can be several files for "lang". 2396 */ 2397 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2398 if (filename ? fullpathcmp(lang, slang->sl_fname, FALSE) == FPC_SAME 2399 : STRICMP(lang, slang->sl_name) == 0) 2400 { 2401 region_mask = REGION_ALL; 2402 if (!filename && region != NULL) 2403 { 2404 /* find region in sl_regions */ 2405 c = find_region(slang->sl_regions, region); 2406 if (c == REGION_ALL) 2407 { 2408 if (slang->sl_add) 2409 { 2410 if (*slang->sl_regions != NUL) 2411 /* This addition file is for other regions. */ 2412 region_mask = 0; 2413 } 2414 else 2415 /* This is probably an error. Give a warning and 2416 * accept the words anyway. */ 2417 smsg(_("Warning: region %s not supported"), 2418 region); 2419 } 2420 else 2421 region_mask = 1 << c; 2422 } 2423 2424 if (region_mask != 0) 2425 { 2426 if (ga_grow(&ga, 1) == FAIL) 2427 { 2428 ga_clear(&ga); 2429 ret_msg = e_outofmem; 2430 goto theend; 2431 } 2432 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang; 2433 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; 2434 ++ga.ga_len; 2435 use_midword(slang, wp); 2436 if (slang->sl_nobreak) 2437 nobreak = TRUE; 2438 } 2439 } 2440 } 2441 2442 /* round 0: load int_wordlist, if possible. 2443 * round 1: load first name in 'spellfile'. 2444 * round 2: load second name in 'spellfile. 2445 * etc. */ 2446 spf = curwin->w_s->b_p_spf; 2447 for (round = 0; round == 0 || *spf != NUL; ++round) 2448 { 2449 if (round == 0) 2450 { 2451 /* Internal wordlist, if there is one. */ 2452 if (int_wordlist == NULL) 2453 continue; 2454 int_wordlist_spl(spf_name); 2455 } 2456 else 2457 { 2458 /* One entry in 'spellfile'. */ 2459 copy_option_part(&spf, spf_name, MAXPATHL - 5, ","); 2460 STRCAT(spf_name, ".spl"); 2461 2462 /* If it was already found above then skip it. */ 2463 for (c = 0; c < ga.ga_len; ++c) 2464 { 2465 p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname; 2466 if (p != NULL && fullpathcmp(spf_name, p, FALSE) == FPC_SAME) 2467 break; 2468 } 2469 if (c < ga.ga_len) 2470 continue; 2471 } 2472 2473 /* Check if it was loaded already. */ 2474 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2475 if (fullpathcmp(spf_name, slang->sl_fname, FALSE) == FPC_SAME) 2476 break; 2477 if (slang == NULL) 2478 { 2479 /* Not loaded, try loading it now. The language name includes the 2480 * region name, the region is ignored otherwise. for int_wordlist 2481 * use an arbitrary name. */ 2482 if (round == 0) 2483 STRCPY(lang, "internal wordlist"); 2484 else 2485 { 2486 vim_strncpy(lang, gettail(spf_name), MAXWLEN); 2487 p = vim_strchr(lang, '.'); 2488 if (p != NULL) 2489 *p = NUL; /* truncate at ".encoding.add" */ 2490 } 2491 slang = spell_load_file(spf_name, lang, NULL, TRUE); 2492 2493 /* If one of the languages has NOBREAK we assume the addition 2494 * files also have this. */ 2495 if (slang != NULL && nobreak) 2496 slang->sl_nobreak = TRUE; 2497 } 2498 if (slang != NULL && ga_grow(&ga, 1) == OK) 2499 { 2500 region_mask = REGION_ALL; 2501 if (use_region != NULL && !dont_use_region) 2502 { 2503 /* find region in sl_regions */ 2504 c = find_region(slang->sl_regions, use_region); 2505 if (c != REGION_ALL) 2506 region_mask = 1 << c; 2507 else if (*slang->sl_regions != NUL) 2508 /* This spell file is for other regions. */ 2509 region_mask = 0; 2510 } 2511 2512 if (region_mask != 0) 2513 { 2514 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang; 2515 LANGP_ENTRY(ga, ga.ga_len)->lp_sallang = NULL; 2516 LANGP_ENTRY(ga, ga.ga_len)->lp_replang = NULL; 2517 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; 2518 ++ga.ga_len; 2519 use_midword(slang, wp); 2520 } 2521 } 2522 } 2523 2524 /* Everything is fine, store the new b_langp value. */ 2525 ga_clear(&wp->w_s->b_langp); 2526 wp->w_s->b_langp = ga; 2527 2528 /* For each language figure out what language to use for sound folding and 2529 * REP items. If the language doesn't support it itself use another one 2530 * with the same name. E.g. for "en-math" use "en". */ 2531 for (i = 0; i < ga.ga_len; ++i) 2532 { 2533 lp = LANGP_ENTRY(ga, i); 2534 2535 /* sound folding */ 2536 if (lp->lp_slang->sl_sal.ga_len > 0) 2537 /* language does sound folding itself */ 2538 lp->lp_sallang = lp->lp_slang; 2539 else 2540 /* find first similar language that does sound folding */ 2541 for (j = 0; j < ga.ga_len; ++j) 2542 { 2543 lp2 = LANGP_ENTRY(ga, j); 2544 if (lp2->lp_slang->sl_sal.ga_len > 0 2545 && STRNCMP(lp->lp_slang->sl_name, 2546 lp2->lp_slang->sl_name, 2) == 0) 2547 { 2548 lp->lp_sallang = lp2->lp_slang; 2549 break; 2550 } 2551 } 2552 2553 /* REP items */ 2554 if (lp->lp_slang->sl_rep.ga_len > 0) 2555 /* language has REP items itself */ 2556 lp->lp_replang = lp->lp_slang; 2557 else 2558 /* find first similar language that has REP items */ 2559 for (j = 0; j < ga.ga_len; ++j) 2560 { 2561 lp2 = LANGP_ENTRY(ga, j); 2562 if (lp2->lp_slang->sl_rep.ga_len > 0 2563 && STRNCMP(lp->lp_slang->sl_name, 2564 lp2->lp_slang->sl_name, 2) == 0) 2565 { 2566 lp->lp_replang = lp2->lp_slang; 2567 break; 2568 } 2569 } 2570 } 2571 2572 theend: 2573 vim_free(spl_copy); 2574 recursive = FALSE; 2575 redraw_win_later(wp, NOT_VALID); 2576 return ret_msg; 2577 } 2578 2579 /* 2580 * Clear the midword characters for buffer "buf". 2581 */ 2582 static void 2583 clear_midword(win_T *wp) 2584 { 2585 vim_memset(wp->w_s->b_spell_ismw, 0, 256); 2586 VIM_CLEAR(wp->w_s->b_spell_ismw_mb); 2587 } 2588 2589 /* 2590 * Use the "sl_midword" field of language "lp" for buffer "buf". 2591 * They add up to any currently used midword characters. 2592 */ 2593 static void 2594 use_midword(slang_T *lp, win_T *wp) 2595 { 2596 char_u *p; 2597 2598 if (lp->sl_midword == NULL) /* there aren't any */ 2599 return; 2600 2601 for (p = lp->sl_midword; *p != NUL; ) 2602 if (has_mbyte) 2603 { 2604 int c, l, n; 2605 char_u *bp; 2606 2607 c = mb_ptr2char(p); 2608 l = (*mb_ptr2len)(p); 2609 if (c < 256 && l <= 2) 2610 wp->w_s->b_spell_ismw[c] = TRUE; 2611 else if (wp->w_s->b_spell_ismw_mb == NULL) 2612 /* First multi-byte char in "b_spell_ismw_mb". */ 2613 wp->w_s->b_spell_ismw_mb = vim_strnsave(p, l); 2614 else 2615 { 2616 /* Append multi-byte chars to "b_spell_ismw_mb". */ 2617 n = (int)STRLEN(wp->w_s->b_spell_ismw_mb); 2618 bp = vim_strnsave(wp->w_s->b_spell_ismw_mb, n + l); 2619 if (bp != NULL) 2620 { 2621 vim_free(wp->w_s->b_spell_ismw_mb); 2622 wp->w_s->b_spell_ismw_mb = bp; 2623 vim_strncpy(bp + n, p, l); 2624 } 2625 } 2626 p += l; 2627 } 2628 else 2629 wp->w_s->b_spell_ismw[*p++] = TRUE; 2630 } 2631 2632 /* 2633 * Find the region "region[2]" in "rp" (points to "sl_regions"). 2634 * Each region is simply stored as the two characters of its name. 2635 * Returns the index if found (first is 0), REGION_ALL if not found. 2636 */ 2637 static int 2638 find_region(char_u *rp, char_u *region) 2639 { 2640 int i; 2641 2642 for (i = 0; ; i += 2) 2643 { 2644 if (rp[i] == NUL) 2645 return REGION_ALL; 2646 if (rp[i] == region[0] && rp[i + 1] == region[1]) 2647 break; 2648 } 2649 return i / 2; 2650 } 2651 2652 /* 2653 * Return case type of word: 2654 * w word 0 2655 * Word WF_ONECAP 2656 * W WORD WF_ALLCAP 2657 * WoRd wOrd WF_KEEPCAP 2658 */ 2659 int 2660 captype( 2661 char_u *word, 2662 char_u *end) /* When NULL use up to NUL byte. */ 2663 { 2664 char_u *p; 2665 int c; 2666 int firstcap; 2667 int allcap; 2668 int past_second = FALSE; /* past second word char */ 2669 2670 /* find first letter */ 2671 for (p = word; !spell_iswordp_nmw(p, curwin); MB_PTR_ADV(p)) 2672 if (end == NULL ? *p == NUL : p >= end) 2673 return 0; /* only non-word characters, illegal word */ 2674 if (has_mbyte) 2675 c = mb_ptr2char_adv(&p); 2676 else 2677 c = *p++; 2678 firstcap = allcap = SPELL_ISUPPER(c); 2679 2680 /* 2681 * Need to check all letters to find a word with mixed upper/lower. 2682 * But a word with an upper char only at start is a ONECAP. 2683 */ 2684 for ( ; end == NULL ? *p != NUL : p < end; MB_PTR_ADV(p)) 2685 if (spell_iswordp_nmw(p, curwin)) 2686 { 2687 c = PTR2CHAR(p); 2688 if (!SPELL_ISUPPER(c)) 2689 { 2690 /* UUl -> KEEPCAP */ 2691 if (past_second && allcap) 2692 return WF_KEEPCAP; 2693 allcap = FALSE; 2694 } 2695 else if (!allcap) 2696 /* UlU -> KEEPCAP */ 2697 return WF_KEEPCAP; 2698 past_second = TRUE; 2699 } 2700 2701 if (allcap) 2702 return WF_ALLCAP; 2703 if (firstcap) 2704 return WF_ONECAP; 2705 return 0; 2706 } 2707 2708 /* 2709 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a 2710 * capital. So that make_case_word() can turn WOrd into Word. 2711 * Add ALLCAP for "WOrD". 2712 */ 2713 static int 2714 badword_captype(char_u *word, char_u *end) 2715 { 2716 int flags = captype(word, end); 2717 int c; 2718 int l, u; 2719 int first; 2720 char_u *p; 2721 2722 if (flags & WF_KEEPCAP) 2723 { 2724 /* Count the number of UPPER and lower case letters. */ 2725 l = u = 0; 2726 first = FALSE; 2727 for (p = word; p < end; MB_PTR_ADV(p)) 2728 { 2729 c = PTR2CHAR(p); 2730 if (SPELL_ISUPPER(c)) 2731 { 2732 ++u; 2733 if (p == word) 2734 first = TRUE; 2735 } 2736 else 2737 ++l; 2738 } 2739 2740 /* If there are more UPPER than lower case letters suggest an 2741 * ALLCAP word. Otherwise, if the first letter is UPPER then 2742 * suggest ONECAP. Exception: "ALl" most likely should be "All", 2743 * require three upper case letters. */ 2744 if (u > l && u > 2) 2745 flags |= WF_ALLCAP; 2746 else if (first) 2747 flags |= WF_ONECAP; 2748 2749 if (u >= 2 && l >= 2) /* maCARONI maCAroni */ 2750 flags |= WF_MIXCAP; 2751 } 2752 return flags; 2753 } 2754 2755 /* 2756 * Delete the internal wordlist and its .spl file. 2757 */ 2758 void 2759 spell_delete_wordlist(void) 2760 { 2761 char_u fname[MAXPATHL]; 2762 2763 if (int_wordlist != NULL) 2764 { 2765 mch_remove(int_wordlist); 2766 int_wordlist_spl(fname); 2767 mch_remove(fname); 2768 VIM_CLEAR(int_wordlist); 2769 } 2770 } 2771 2772 /* 2773 * Free all languages. 2774 */ 2775 void 2776 spell_free_all(void) 2777 { 2778 slang_T *slang; 2779 buf_T *buf; 2780 2781 /* Go through all buffers and handle 'spelllang'. <VN> */ 2782 FOR_ALL_BUFFERS(buf) 2783 ga_clear(&buf->b_s.b_langp); 2784 2785 while (first_lang != NULL) 2786 { 2787 slang = first_lang; 2788 first_lang = slang->sl_next; 2789 slang_free(slang); 2790 } 2791 2792 spell_delete_wordlist(); 2793 2794 VIM_CLEAR(repl_to); 2795 VIM_CLEAR(repl_from); 2796 } 2797 2798 /* 2799 * Clear all spelling tables and reload them. 2800 * Used after 'encoding' is set and when ":mkspell" was used. 2801 */ 2802 void 2803 spell_reload(void) 2804 { 2805 win_T *wp; 2806 2807 /* Initialize the table for spell_iswordp(). */ 2808 init_spell_chartab(); 2809 2810 /* Unload all allocated memory. */ 2811 spell_free_all(); 2812 2813 /* Go through all buffers and handle 'spelllang'. */ 2814 FOR_ALL_WINDOWS(wp) 2815 { 2816 /* Only load the wordlists when 'spelllang' is set and there is a 2817 * window for this buffer in which 'spell' is set. */ 2818 if (*wp->w_s->b_p_spl != NUL) 2819 { 2820 if (wp->w_p_spell) 2821 { 2822 (void)did_set_spelllang(wp); 2823 break; 2824 } 2825 } 2826 } 2827 } 2828 2829 /* 2830 * Opposite of offset2bytes(). 2831 * "pp" points to the bytes and is advanced over it. 2832 * Returns the offset. 2833 */ 2834 static int 2835 bytes2offset(char_u **pp) 2836 { 2837 char_u *p = *pp; 2838 int nr; 2839 int c; 2840 2841 c = *p++; 2842 if ((c & 0x80) == 0x00) /* 1 byte */ 2843 { 2844 nr = c - 1; 2845 } 2846 else if ((c & 0xc0) == 0x80) /* 2 bytes */ 2847 { 2848 nr = (c & 0x3f) - 1; 2849 nr = nr * 255 + (*p++ - 1); 2850 } 2851 else if ((c & 0xe0) == 0xc0) /* 3 bytes */ 2852 { 2853 nr = (c & 0x1f) - 1; 2854 nr = nr * 255 + (*p++ - 1); 2855 nr = nr * 255 + (*p++ - 1); 2856 } 2857 else /* 4 bytes */ 2858 { 2859 nr = (c & 0x0f) - 1; 2860 nr = nr * 255 + (*p++ - 1); 2861 nr = nr * 255 + (*p++ - 1); 2862 nr = nr * 255 + (*p++ - 1); 2863 } 2864 2865 *pp = p; 2866 return nr; 2867 } 2868 2869 2870 /* 2871 * Open a spell buffer. This is a nameless buffer that is not in the buffer 2872 * list and only contains text lines. Can use a swapfile to reduce memory 2873 * use. 2874 * Most other fields are invalid! Esp. watch out for string options being 2875 * NULL and there is no undo info. 2876 * Returns NULL when out of memory. 2877 */ 2878 buf_T * 2879 open_spellbuf(void) 2880 { 2881 buf_T *buf; 2882 2883 buf = (buf_T *)alloc_clear(sizeof(buf_T)); 2884 if (buf != NULL) 2885 { 2886 buf->b_spell = TRUE; 2887 buf->b_p_swf = TRUE; /* may create a swap file */ 2888 #ifdef FEAT_CRYPT 2889 buf->b_p_key = empty_option; 2890 #endif 2891 ml_open(buf); 2892 ml_open_file(buf); /* create swap file now */ 2893 } 2894 return buf; 2895 } 2896 2897 /* 2898 * Close the buffer used for spell info. 2899 */ 2900 void 2901 close_spellbuf(buf_T *buf) 2902 { 2903 if (buf != NULL) 2904 { 2905 ml_close(buf, TRUE); 2906 vim_free(buf); 2907 } 2908 } 2909 2910 /* 2911 * Init the chartab used for spelling for ASCII. 2912 * EBCDIC is not supported! 2913 */ 2914 void 2915 clear_spell_chartab(spelltab_T *sp) 2916 { 2917 int i; 2918 2919 /* Init everything to FALSE. */ 2920 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw)); 2921 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu)); 2922 for (i = 0; i < 256; ++i) 2923 { 2924 sp->st_fold[i] = i; 2925 sp->st_upper[i] = i; 2926 } 2927 2928 /* We include digits. A word shouldn't start with a digit, but handling 2929 * that is done separately. */ 2930 for (i = '0'; i <= '9'; ++i) 2931 sp->st_isw[i] = TRUE; 2932 for (i = 'A'; i <= 'Z'; ++i) 2933 { 2934 sp->st_isw[i] = TRUE; 2935 sp->st_isu[i] = TRUE; 2936 sp->st_fold[i] = i + 0x20; 2937 } 2938 for (i = 'a'; i <= 'z'; ++i) 2939 { 2940 sp->st_isw[i] = TRUE; 2941 sp->st_upper[i] = i - 0x20; 2942 } 2943 } 2944 2945 /* 2946 * Init the chartab used for spelling. Only depends on 'encoding'. 2947 * Called once while starting up and when 'encoding' changes. 2948 * The default is to use isalpha(), but the spell file should define the word 2949 * characters to make it possible that 'encoding' differs from the current 2950 * locale. For utf-8 we don't use isalpha() but our own functions. 2951 */ 2952 void 2953 init_spell_chartab(void) 2954 { 2955 int i; 2956 2957 did_set_spelltab = FALSE; 2958 clear_spell_chartab(&spelltab); 2959 if (enc_dbcs) 2960 { 2961 /* DBCS: assume double-wide characters are word characters. */ 2962 for (i = 128; i <= 255; ++i) 2963 if (MB_BYTE2LEN(i) == 2) 2964 spelltab.st_isw[i] = TRUE; 2965 } 2966 else if (enc_utf8) 2967 { 2968 for (i = 128; i < 256; ++i) 2969 { 2970 int f = utf_fold(i); 2971 int u = utf_toupper(i); 2972 2973 spelltab.st_isu[i] = utf_isupper(i); 2974 spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i); 2975 /* The folded/upper-cased value is different between latin1 and 2976 * utf8 for 0xb5, causing E763 for no good reason. Use the latin1 2977 * value for utf-8 to avoid this. */ 2978 spelltab.st_fold[i] = (f < 256) ? f : i; 2979 spelltab.st_upper[i] = (u < 256) ? u : i; 2980 } 2981 } 2982 else 2983 { 2984 /* Rough guess: use locale-dependent library functions. */ 2985 for (i = 128; i < 256; ++i) 2986 { 2987 if (MB_ISUPPER(i)) 2988 { 2989 spelltab.st_isw[i] = TRUE; 2990 spelltab.st_isu[i] = TRUE; 2991 spelltab.st_fold[i] = MB_TOLOWER(i); 2992 } 2993 else if (MB_ISLOWER(i)) 2994 { 2995 spelltab.st_isw[i] = TRUE; 2996 spelltab.st_upper[i] = MB_TOUPPER(i); 2997 } 2998 } 2999 } 3000 } 3001 3002 3003 /* 3004 * Return TRUE if "p" points to a word character. 3005 * As a special case we see "midword" characters as word character when it is 3006 * followed by a word character. This finds they'there but not 'they there'. 3007 * Thus this only works properly when past the first character of the word. 3008 */ 3009 static int 3010 spell_iswordp( 3011 char_u *p, 3012 win_T *wp) /* buffer used */ 3013 { 3014 char_u *s; 3015 int l; 3016 int c; 3017 3018 if (has_mbyte) 3019 { 3020 l = MB_PTR2LEN(p); 3021 s = p; 3022 if (l == 1) 3023 { 3024 /* be quick for ASCII */ 3025 if (wp->w_s->b_spell_ismw[*p]) 3026 s = p + 1; /* skip a mid-word character */ 3027 } 3028 else 3029 { 3030 c = mb_ptr2char(p); 3031 if (c < 256 ? wp->w_s->b_spell_ismw[c] 3032 : (wp->w_s->b_spell_ismw_mb != NULL 3033 && vim_strchr(wp->w_s->b_spell_ismw_mb, c) != NULL)) 3034 s = p + l; 3035 } 3036 3037 c = mb_ptr2char(s); 3038 if (c > 255) 3039 return spell_mb_isword_class(mb_get_class(s), wp); 3040 return spelltab.st_isw[c]; 3041 } 3042 3043 return spelltab.st_isw[wp->w_s->b_spell_ismw[*p] ? p[1] : p[0]]; 3044 } 3045 3046 /* 3047 * Return TRUE if "p" points to a word character. 3048 * Unlike spell_iswordp() this doesn't check for "midword" characters. 3049 */ 3050 int 3051 spell_iswordp_nmw(char_u *p, win_T *wp) 3052 { 3053 int c; 3054 3055 if (has_mbyte) 3056 { 3057 c = mb_ptr2char(p); 3058 if (c > 255) 3059 return spell_mb_isword_class(mb_get_class(p), wp); 3060 return spelltab.st_isw[c]; 3061 } 3062 return spelltab.st_isw[*p]; 3063 } 3064 3065 /* 3066 * Return TRUE if word class indicates a word character. 3067 * Only for characters above 255. 3068 * Unicode subscript and superscript are not considered word characters. 3069 * See also dbcs_class() and utf_class() in mbyte.c. 3070 */ 3071 static int 3072 spell_mb_isword_class(int cl, win_T *wp) 3073 { 3074 if (wp->w_s->b_cjk) 3075 /* East Asian characters are not considered word characters. */ 3076 return cl == 2 || cl == 0x2800; 3077 return cl >= 2 && cl != 0x2070 && cl != 0x2080; 3078 } 3079 3080 /* 3081 * Return TRUE if "p" points to a word character. 3082 * Wide version of spell_iswordp(). 3083 */ 3084 static int 3085 spell_iswordp_w(int *p, win_T *wp) 3086 { 3087 int *s; 3088 3089 if (*p < 256 ? wp->w_s->b_spell_ismw[*p] 3090 : (wp->w_s->b_spell_ismw_mb != NULL 3091 && vim_strchr(wp->w_s->b_spell_ismw_mb, *p) != NULL)) 3092 s = p + 1; 3093 else 3094 s = p; 3095 3096 if (*s > 255) 3097 { 3098 if (enc_utf8) 3099 return spell_mb_isword_class(utf_class(*s), wp); 3100 if (enc_dbcs) 3101 return spell_mb_isword_class( 3102 dbcs_class((unsigned)*s >> 8, *s & 0xff), wp); 3103 return 0; 3104 } 3105 return spelltab.st_isw[*s]; 3106 } 3107 3108 /* 3109 * Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated. 3110 * Uses the character definitions from the .spl file. 3111 * When using a multi-byte 'encoding' the length may change! 3112 * Returns FAIL when something wrong. 3113 */ 3114 int 3115 spell_casefold( 3116 char_u *str, 3117 int len, 3118 char_u *buf, 3119 int buflen) 3120 { 3121 int i; 3122 3123 if (len >= buflen) 3124 { 3125 buf[0] = NUL; 3126 return FAIL; /* result will not fit */ 3127 } 3128 3129 if (has_mbyte) 3130 { 3131 int outi = 0; 3132 char_u *p; 3133 int c; 3134 3135 /* Fold one character at a time. */ 3136 for (p = str; p < str + len; ) 3137 { 3138 if (outi + MB_MAXBYTES > buflen) 3139 { 3140 buf[outi] = NUL; 3141 return FAIL; 3142 } 3143 c = mb_cptr2char_adv(&p); 3144 outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi); 3145 } 3146 buf[outi] = NUL; 3147 } 3148 else 3149 { 3150 /* Be quick for non-multibyte encodings. */ 3151 for (i = 0; i < len; ++i) 3152 buf[i] = spelltab.st_fold[str[i]]; 3153 buf[i] = NUL; 3154 } 3155 3156 return OK; 3157 } 3158 3159 /* values for sps_flags */ 3160 #define SPS_BEST 1 3161 #define SPS_FAST 2 3162 #define SPS_DOUBLE 4 3163 3164 static int sps_flags = SPS_BEST; /* flags from 'spellsuggest' */ 3165 static int sps_limit = 9999; /* max nr of suggestions given */ 3166 3167 /* 3168 * Check the 'spellsuggest' option. Return FAIL if it's wrong. 3169 * Sets "sps_flags" and "sps_limit". 3170 */ 3171 int 3172 spell_check_sps(void) 3173 { 3174 char_u *p; 3175 char_u *s; 3176 char_u buf[MAXPATHL]; 3177 int f; 3178 3179 sps_flags = 0; 3180 sps_limit = 9999; 3181 3182 for (p = p_sps; *p != NUL; ) 3183 { 3184 copy_option_part(&p, buf, MAXPATHL, ","); 3185 3186 f = 0; 3187 if (VIM_ISDIGIT(*buf)) 3188 { 3189 s = buf; 3190 sps_limit = getdigits(&s); 3191 if (*s != NUL && !VIM_ISDIGIT(*s)) 3192 f = -1; 3193 } 3194 else if (STRCMP(buf, "best") == 0) 3195 f = SPS_BEST; 3196 else if (STRCMP(buf, "fast") == 0) 3197 f = SPS_FAST; 3198 else if (STRCMP(buf, "double") == 0) 3199 f = SPS_DOUBLE; 3200 else if (STRNCMP(buf, "expr:", 5) != 0 3201 && STRNCMP(buf, "file:", 5) != 0) 3202 f = -1; 3203 3204 if (f == -1 || (sps_flags != 0 && f != 0)) 3205 { 3206 sps_flags = SPS_BEST; 3207 sps_limit = 9999; 3208 return FAIL; 3209 } 3210 if (f != 0) 3211 sps_flags = f; 3212 } 3213 3214 if (sps_flags == 0) 3215 sps_flags = SPS_BEST; 3216 3217 return OK; 3218 } 3219 3220 /* 3221 * "z=": Find badly spelled word under or after the cursor. 3222 * Give suggestions for the properly spelled word. 3223 * In Visual mode use the highlighted word as the bad word. 3224 * When "count" is non-zero use that suggestion. 3225 */ 3226 void 3227 spell_suggest(int count) 3228 { 3229 char_u *line; 3230 pos_T prev_cursor = curwin->w_cursor; 3231 char_u wcopy[MAXWLEN + 2]; 3232 char_u *p; 3233 int i; 3234 int c; 3235 suginfo_T sug; 3236 suggest_T *stp; 3237 int mouse_used; 3238 int need_cap; 3239 int limit; 3240 int selected = count; 3241 int badlen = 0; 3242 int msg_scroll_save = msg_scroll; 3243 3244 if (no_spell_checking(curwin)) 3245 return; 3246 3247 if (VIsual_active) 3248 { 3249 /* Use the Visually selected text as the bad word. But reject 3250 * a multi-line selection. */ 3251 if (curwin->w_cursor.lnum != VIsual.lnum) 3252 { 3253 vim_beep(BO_SPELL); 3254 return; 3255 } 3256 badlen = (int)curwin->w_cursor.col - (int)VIsual.col; 3257 if (badlen < 0) 3258 badlen = -badlen; 3259 else 3260 curwin->w_cursor.col = VIsual.col; 3261 ++badlen; 3262 end_visual_mode(); 3263 } 3264 /* Find the start of the badly spelled word. */ 3265 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0 3266 || curwin->w_cursor.col > prev_cursor.col) 3267 { 3268 /* No bad word or it starts after the cursor: use the word under the 3269 * cursor. */ 3270 curwin->w_cursor = prev_cursor; 3271 line = ml_get_curline(); 3272 p = line + curwin->w_cursor.col; 3273 /* Backup to before start of word. */ 3274 while (p > line && spell_iswordp_nmw(p, curwin)) 3275 MB_PTR_BACK(line, p); 3276 /* Forward to start of word. */ 3277 while (*p != NUL && !spell_iswordp_nmw(p, curwin)) 3278 MB_PTR_ADV(p); 3279 3280 if (!spell_iswordp_nmw(p, curwin)) /* No word found. */ 3281 { 3282 beep_flush(); 3283 return; 3284 } 3285 curwin->w_cursor.col = (colnr_T)(p - line); 3286 } 3287 3288 /* Get the word and its length. */ 3289 3290 /* Figure out if the word should be capitalised. */ 3291 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col); 3292 3293 /* Make a copy of current line since autocommands may free the line. */ 3294 line = vim_strsave(ml_get_curline()); 3295 if (line == NULL) 3296 goto skip; 3297 3298 /* Get the list of suggestions. Limit to 'lines' - 2 or the number in 3299 * 'spellsuggest', whatever is smaller. */ 3300 if (sps_limit > (int)Rows - 2) 3301 limit = (int)Rows - 2; 3302 else 3303 limit = sps_limit; 3304 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit, 3305 TRUE, need_cap, TRUE); 3306 3307 if (sug.su_ga.ga_len == 0) 3308 msg(_("Sorry, no suggestions")); 3309 else if (count > 0) 3310 { 3311 if (count > sug.su_ga.ga_len) 3312 smsg(_("Sorry, only %ld suggestions"), 3313 (long)sug.su_ga.ga_len); 3314 } 3315 else 3316 { 3317 VIM_CLEAR(repl_from); 3318 VIM_CLEAR(repl_to); 3319 3320 #ifdef FEAT_RIGHTLEFT 3321 /* When 'rightleft' is set the list is drawn right-left. */ 3322 cmdmsg_rl = curwin->w_p_rl; 3323 if (cmdmsg_rl) 3324 msg_col = Columns - 1; 3325 #endif 3326 3327 /* List the suggestions. */ 3328 msg_start(); 3329 msg_row = Rows - 1; /* for when 'cmdheight' > 1 */ 3330 lines_left = Rows; /* avoid more prompt */ 3331 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"), 3332 sug.su_badlen, sug.su_badptr); 3333 #ifdef FEAT_RIGHTLEFT 3334 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0) 3335 { 3336 /* And now the rabbit from the high hat: Avoid showing the 3337 * untranslated message rightleft. */ 3338 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC", 3339 sug.su_badlen, sug.su_badptr); 3340 } 3341 #endif 3342 msg_puts((char *)IObuff); 3343 msg_clr_eos(); 3344 msg_putchar('\n'); 3345 3346 msg_scroll = TRUE; 3347 for (i = 0; i < sug.su_ga.ga_len; ++i) 3348 { 3349 stp = &SUG(sug.su_ga, i); 3350 3351 /* The suggested word may replace only part of the bad word, add 3352 * the not replaced part. */ 3353 vim_strncpy(wcopy, stp->st_word, MAXWLEN); 3354 if (sug.su_badlen > stp->st_orglen) 3355 vim_strncpy(wcopy + stp->st_wordlen, 3356 sug.su_badptr + stp->st_orglen, 3357 sug.su_badlen - stp->st_orglen); 3358 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); 3359 #ifdef FEAT_RIGHTLEFT 3360 if (cmdmsg_rl) 3361 rl_mirror(IObuff); 3362 #endif 3363 msg_puts((char *)IObuff); 3364 3365 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy); 3366 msg_puts((char *)IObuff); 3367 3368 /* The word may replace more than "su_badlen". */ 3369 if (sug.su_badlen < stp->st_orglen) 3370 { 3371 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""), 3372 stp->st_orglen, sug.su_badptr); 3373 msg_puts((char *)IObuff); 3374 } 3375 3376 if (p_verbose > 0) 3377 { 3378 /* Add the score. */ 3379 if (sps_flags & (SPS_DOUBLE | SPS_BEST)) 3380 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)", 3381 stp->st_salscore ? "s " : "", 3382 stp->st_score, stp->st_altscore); 3383 else 3384 vim_snprintf((char *)IObuff, IOSIZE, " (%d)", 3385 stp->st_score); 3386 #ifdef FEAT_RIGHTLEFT 3387 if (cmdmsg_rl) 3388 /* Mirror the numbers, but keep the leading space. */ 3389 rl_mirror(IObuff + 1); 3390 #endif 3391 msg_advance(30); 3392 msg_puts((char *)IObuff); 3393 } 3394 msg_putchar('\n'); 3395 } 3396 3397 #ifdef FEAT_RIGHTLEFT 3398 cmdmsg_rl = FALSE; 3399 msg_col = 0; 3400 #endif 3401 /* Ask for choice. */ 3402 selected = prompt_for_number(&mouse_used); 3403 if (mouse_used) 3404 selected -= lines_left; 3405 lines_left = Rows; /* avoid more prompt */ 3406 /* don't delay for 'smd' in normal_cmd() */ 3407 msg_scroll = msg_scroll_save; 3408 } 3409 3410 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) 3411 { 3412 /* Save the from and to text for :spellrepall. */ 3413 stp = &SUG(sug.su_ga, selected - 1); 3414 if (sug.su_badlen > stp->st_orglen) 3415 { 3416 /* Replacing less than "su_badlen", append the remainder to 3417 * repl_to. */ 3418 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen); 3419 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word, 3420 sug.su_badlen - stp->st_orglen, 3421 sug.su_badptr + stp->st_orglen); 3422 repl_to = vim_strsave(IObuff); 3423 } 3424 else 3425 { 3426 /* Replacing su_badlen or more, use the whole word. */ 3427 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); 3428 repl_to = vim_strsave(stp->st_word); 3429 } 3430 3431 /* Replace the word. */ 3432 p = alloc((unsigned)STRLEN(line) - stp->st_orglen 3433 + stp->st_wordlen + 1); 3434 if (p != NULL) 3435 { 3436 c = (int)(sug.su_badptr - line); 3437 mch_memmove(p, line, c); 3438 STRCPY(p + c, stp->st_word); 3439 STRCAT(p, sug.su_badptr + stp->st_orglen); 3440 ml_replace(curwin->w_cursor.lnum, p, FALSE); 3441 curwin->w_cursor.col = c; 3442 3443 /* For redo we use a change-word command. */ 3444 ResetRedobuff(); 3445 AppendToRedobuff((char_u *)"ciw"); 3446 AppendToRedobuffLit(p + c, 3447 stp->st_wordlen + sug.su_badlen - stp->st_orglen); 3448 AppendCharToRedobuff(ESC); 3449 3450 /* After this "p" may be invalid. */ 3451 changed_bytes(curwin->w_cursor.lnum, c); 3452 } 3453 } 3454 else 3455 curwin->w_cursor = prev_cursor; 3456 3457 spell_find_cleanup(&sug); 3458 skip: 3459 vim_free(line); 3460 } 3461 3462 /* 3463 * Check if the word at line "lnum" column "col" is required to start with a 3464 * capital. This uses 'spellcapcheck' of the current buffer. 3465 */ 3466 static int 3467 check_need_cap(linenr_T lnum, colnr_T col) 3468 { 3469 int need_cap = FALSE; 3470 char_u *line; 3471 char_u *line_copy = NULL; 3472 char_u *p; 3473 colnr_T endcol; 3474 regmatch_T regmatch; 3475 3476 if (curwin->w_s->b_cap_prog == NULL) 3477 return FALSE; 3478 3479 line = ml_get_curline(); 3480 endcol = 0; 3481 if (getwhitecols(line) >= (int)col) 3482 { 3483 /* At start of line, check if previous line is empty or sentence 3484 * ends there. */ 3485 if (lnum == 1) 3486 need_cap = TRUE; 3487 else 3488 { 3489 line = ml_get(lnum - 1); 3490 if (*skipwhite(line) == NUL) 3491 need_cap = TRUE; 3492 else 3493 { 3494 /* Append a space in place of the line break. */ 3495 line_copy = concat_str(line, (char_u *)" "); 3496 line = line_copy; 3497 endcol = (colnr_T)STRLEN(line); 3498 } 3499 } 3500 } 3501 else 3502 endcol = col; 3503 3504 if (endcol > 0) 3505 { 3506 /* Check if sentence ends before the bad word. */ 3507 regmatch.regprog = curwin->w_s->b_cap_prog; 3508 regmatch.rm_ic = FALSE; 3509 p = line + endcol; 3510 for (;;) 3511 { 3512 MB_PTR_BACK(line, p); 3513 if (p == line || spell_iswordp_nmw(p, curwin)) 3514 break; 3515 if (vim_regexec(®match, p, 0) 3516 && regmatch.endp[0] == line + endcol) 3517 { 3518 need_cap = TRUE; 3519 break; 3520 } 3521 } 3522 curwin->w_s->b_cap_prog = regmatch.regprog; 3523 } 3524 3525 vim_free(line_copy); 3526 3527 return need_cap; 3528 } 3529 3530 3531 /* 3532 * ":spellrepall" 3533 */ 3534 void 3535 ex_spellrepall(exarg_T *eap UNUSED) 3536 { 3537 pos_T pos = curwin->w_cursor; 3538 char_u *frompat; 3539 int addlen; 3540 char_u *line; 3541 char_u *p; 3542 int save_ws = p_ws; 3543 linenr_T prev_lnum = 0; 3544 3545 if (repl_from == NULL || repl_to == NULL) 3546 { 3547 emsg(_("E752: No previous spell replacement")); 3548 return; 3549 } 3550 addlen = (int)(STRLEN(repl_to) - STRLEN(repl_from)); 3551 3552 frompat = alloc((unsigned)STRLEN(repl_from) + 7); 3553 if (frompat == NULL) 3554 return; 3555 sprintf((char *)frompat, "\\V\\<%s\\>", repl_from); 3556 p_ws = FALSE; 3557 3558 sub_nsubs = 0; 3559 sub_nlines = 0; 3560 curwin->w_cursor.lnum = 0; 3561 while (!got_int) 3562 { 3563 if (do_search(NULL, '/', frompat, 1L, SEARCH_KEEP, NULL, NULL) == 0 3564 || u_save_cursor() == FAIL) 3565 break; 3566 3567 /* Only replace when the right word isn't there yet. This happens 3568 * when changing "etc" to "etc.". */ 3569 line = ml_get_curline(); 3570 if (addlen <= 0 || STRNCMP(line + curwin->w_cursor.col, 3571 repl_to, STRLEN(repl_to)) != 0) 3572 { 3573 p = alloc((unsigned)STRLEN(line) + addlen + 1); 3574 if (p == NULL) 3575 break; 3576 mch_memmove(p, line, curwin->w_cursor.col); 3577 STRCPY(p + curwin->w_cursor.col, repl_to); 3578 STRCAT(p, line + curwin->w_cursor.col + STRLEN(repl_from)); 3579 ml_replace(curwin->w_cursor.lnum, p, FALSE); 3580 changed_bytes(curwin->w_cursor.lnum, curwin->w_cursor.col); 3581 3582 if (curwin->w_cursor.lnum != prev_lnum) 3583 { 3584 ++sub_nlines; 3585 prev_lnum = curwin->w_cursor.lnum; 3586 } 3587 ++sub_nsubs; 3588 } 3589 curwin->w_cursor.col += (colnr_T)STRLEN(repl_to); 3590 } 3591 3592 p_ws = save_ws; 3593 curwin->w_cursor = pos; 3594 vim_free(frompat); 3595 3596 if (sub_nsubs == 0) 3597 semsg(_("E753: Not found: %s"), repl_from); 3598 else 3599 do_sub_msg(FALSE); 3600 } 3601 3602 /* 3603 * Find spell suggestions for "word". Return them in the growarray "*gap" as 3604 * a list of allocated strings. 3605 */ 3606 void 3607 spell_suggest_list( 3608 garray_T *gap, 3609 char_u *word, 3610 int maxcount, /* maximum nr of suggestions */ 3611 int need_cap, /* 'spellcapcheck' matched */ 3612 int interactive) 3613 { 3614 suginfo_T sug; 3615 int i; 3616 suggest_T *stp; 3617 char_u *wcopy; 3618 3619 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive); 3620 3621 /* Make room in "gap". */ 3622 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); 3623 if (ga_grow(gap, sug.su_ga.ga_len) == OK) 3624 { 3625 for (i = 0; i < sug.su_ga.ga_len; ++i) 3626 { 3627 stp = &SUG(sug.su_ga, i); 3628 3629 /* The suggested word may replace only part of "word", add the not 3630 * replaced part. */ 3631 wcopy = alloc(stp->st_wordlen 3632 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1); 3633 if (wcopy == NULL) 3634 break; 3635 STRCPY(wcopy, stp->st_word); 3636 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); 3637 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; 3638 } 3639 } 3640 3641 spell_find_cleanup(&sug); 3642 } 3643 3644 /* 3645 * Find spell suggestions for the word at the start of "badptr". 3646 * Return the suggestions in "su->su_ga". 3647 * The maximum number of suggestions is "maxcount". 3648 * Note: does use info for the current window. 3649 * This is based on the mechanisms of Aspell, but completely reimplemented. 3650 */ 3651 static void 3652 spell_find_suggest( 3653 char_u *badptr, 3654 int badlen, /* length of bad word or 0 if unknown */ 3655 suginfo_T *su, 3656 int maxcount, 3657 int banbadword, /* don't include badword in suggestions */ 3658 int need_cap, /* word should start with capital */ 3659 int interactive) 3660 { 3661 hlf_T attr = HLF_COUNT; 3662 char_u buf[MAXPATHL]; 3663 char_u *p; 3664 int do_combine = FALSE; 3665 char_u *sps_copy; 3666 #ifdef FEAT_EVAL 3667 static int expr_busy = FALSE; 3668 #endif 3669 int c; 3670 int i; 3671 langp_T *lp; 3672 3673 /* 3674 * Set the info in "*su". 3675 */ 3676 vim_memset(su, 0, sizeof(suginfo_T)); 3677 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10); 3678 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10); 3679 if (*badptr == NUL) 3680 return; 3681 hash_init(&su->su_banned); 3682 3683 su->su_badptr = badptr; 3684 if (badlen != 0) 3685 su->su_badlen = badlen; 3686 else 3687 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE); 3688 su->su_maxcount = maxcount; 3689 su->su_maxscore = SCORE_MAXINIT; 3690 3691 if (su->su_badlen >= MAXWLEN) 3692 su->su_badlen = MAXWLEN - 1; /* just in case */ 3693 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen); 3694 (void)spell_casefold(su->su_badptr, su->su_badlen, 3695 su->su_fbadword, MAXWLEN); 3696 /* TODO: make this work if the case-folded text is longer than the original 3697 * text. Currently an illegal byte causes wrong pointer computations. */ 3698 su->su_fbadword[su->su_badlen] = NUL; 3699 3700 /* get caps flags for bad word */ 3701 su->su_badflags = badword_captype(su->su_badptr, 3702 su->su_badptr + su->su_badlen); 3703 if (need_cap) 3704 su->su_badflags |= WF_ONECAP; 3705 3706 /* Find the default language for sound folding. We simply use the first 3707 * one in 'spelllang' that supports sound folding. That's good for when 3708 * using multiple files for one language, it's not that bad when mixing 3709 * languages (e.g., "pl,en"). */ 3710 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i) 3711 { 3712 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i); 3713 if (lp->lp_sallang != NULL) 3714 { 3715 su->su_sallang = lp->lp_sallang; 3716 break; 3717 } 3718 } 3719 3720 /* Soundfold the bad word with the default sound folding, so that we don't 3721 * have to do this many times. */ 3722 if (su->su_sallang != NULL) 3723 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE, 3724 su->su_sal_badword); 3725 3726 /* If the word is not capitalised and spell_check() doesn't consider the 3727 * word to be bad then it might need to be capitalised. Add a suggestion 3728 * for that. */ 3729 c = PTR2CHAR(su->su_badptr); 3730 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) 3731 { 3732 make_case_word(su->su_badword, buf, WF_ONECAP); 3733 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, 3734 0, TRUE, su->su_sallang, FALSE); 3735 } 3736 3737 /* Ban the bad word itself. It may appear in another region. */ 3738 if (banbadword) 3739 add_banned(su, su->su_badword); 3740 3741 /* Make a copy of 'spellsuggest', because the expression may change it. */ 3742 sps_copy = vim_strsave(p_sps); 3743 if (sps_copy == NULL) 3744 return; 3745 3746 /* Loop over the items in 'spellsuggest'. */ 3747 for (p = sps_copy; *p != NUL; ) 3748 { 3749 copy_option_part(&p, buf, MAXPATHL, ","); 3750 3751 if (STRNCMP(buf, "expr:", 5) == 0) 3752 { 3753 #ifdef FEAT_EVAL 3754 /* Evaluate an expression. Skip this when called recursively, 3755 * when using spellsuggest() in the expression. */ 3756 if (!expr_busy) 3757 { 3758 expr_busy = TRUE; 3759 spell_suggest_expr(su, buf + 5); 3760 expr_busy = FALSE; 3761 } 3762 #endif 3763 } 3764 else if (STRNCMP(buf, "file:", 5) == 0) 3765 /* Use list of suggestions in a file. */ 3766 spell_suggest_file(su, buf + 5); 3767 else 3768 { 3769 /* Use internal method. */ 3770 spell_suggest_intern(su, interactive); 3771 if (sps_flags & SPS_DOUBLE) 3772 do_combine = TRUE; 3773 } 3774 } 3775 3776 vim_free(sps_copy); 3777 3778 if (do_combine) 3779 /* Combine the two list of suggestions. This must be done last, 3780 * because sorting changes the order again. */ 3781 score_combine(su); 3782 } 3783 3784 #ifdef FEAT_EVAL 3785 /* 3786 * Find suggestions by evaluating expression "expr". 3787 */ 3788 static void 3789 spell_suggest_expr(suginfo_T *su, char_u *expr) 3790 { 3791 list_T *list; 3792 listitem_T *li; 3793 int score; 3794 char_u *p; 3795 3796 /* The work is split up in a few parts to avoid having to export 3797 * suginfo_T. 3798 * First evaluate the expression and get the resulting list. */ 3799 list = eval_spell_expr(su->su_badword, expr); 3800 if (list != NULL) 3801 { 3802 /* Loop over the items in the list. */ 3803 for (li = list->lv_first; li != NULL; li = li->li_next) 3804 if (li->li_tv.v_type == VAR_LIST) 3805 { 3806 /* Get the word and the score from the items. */ 3807 score = get_spellword(li->li_tv.vval.v_list, &p); 3808 if (score >= 0 && score <= su->su_maxscore) 3809 add_suggestion(su, &su->su_ga, p, su->su_badlen, 3810 score, 0, TRUE, su->su_sallang, FALSE); 3811 } 3812 list_unref(list); 3813 } 3814 3815 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3816 check_suggestions(su, &su->su_ga); 3817 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3818 } 3819 #endif 3820 3821 /* 3822 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'. 3823 */ 3824 static void 3825 spell_suggest_file(suginfo_T *su, char_u *fname) 3826 { 3827 FILE *fd; 3828 char_u line[MAXWLEN * 2]; 3829 char_u *p; 3830 int len; 3831 char_u cword[MAXWLEN]; 3832 3833 /* Open the file. */ 3834 fd = mch_fopen((char *)fname, "r"); 3835 if (fd == NULL) 3836 { 3837 semsg(_(e_notopen), fname); 3838 return; 3839 } 3840 3841 /* Read it line by line. */ 3842 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) 3843 { 3844 line_breakcheck(); 3845 3846 p = vim_strchr(line, '/'); 3847 if (p == NULL) 3848 continue; /* No Tab found, just skip the line. */ 3849 *p++ = NUL; 3850 if (STRICMP(su->su_badword, line) == 0) 3851 { 3852 /* Match! Isolate the good word, until CR or NL. */ 3853 for (len = 0; p[len] >= ' '; ++len) 3854 ; 3855 p[len] = NUL; 3856 3857 /* If the suggestion doesn't have specific case duplicate the case 3858 * of the bad word. */ 3859 if (captype(p, NULL) == 0) 3860 { 3861 make_case_word(p, cword, su->su_badflags); 3862 p = cword; 3863 } 3864 3865 add_suggestion(su, &su->su_ga, p, su->su_badlen, 3866 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE); 3867 } 3868 } 3869 3870 fclose(fd); 3871 3872 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3873 check_suggestions(su, &su->su_ga); 3874 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3875 } 3876 3877 /* 3878 * Find suggestions for the internal method indicated by "sps_flags". 3879 */ 3880 static void 3881 spell_suggest_intern(suginfo_T *su, int interactive) 3882 { 3883 /* 3884 * Load the .sug file(s) that are available and not done yet. 3885 */ 3886 suggest_load_files(); 3887 3888 /* 3889 * 1. Try special cases, such as repeating a word: "the the" -> "the". 3890 * 3891 * Set a maximum score to limit the combination of operations that is 3892 * tried. 3893 */ 3894 suggest_try_special(su); 3895 3896 /* 3897 * 2. Try inserting/deleting/swapping/changing a letter, use REP entries 3898 * from the .aff file and inserting a space (split the word). 3899 */ 3900 suggest_try_change(su); 3901 3902 /* For the resulting top-scorers compute the sound-a-like score. */ 3903 if (sps_flags & SPS_DOUBLE) 3904 score_comp_sal(su); 3905 3906 /* 3907 * 3. Try finding sound-a-like words. 3908 */ 3909 if ((sps_flags & SPS_FAST) == 0) 3910 { 3911 if (sps_flags & SPS_BEST) 3912 /* Adjust the word score for the suggestions found so far for how 3913 * they sounds like. */ 3914 rescore_suggestions(su); 3915 3916 /* 3917 * While going through the soundfold tree "su_maxscore" is the score 3918 * for the soundfold word, limits the changes that are being tried, 3919 * and "su_sfmaxscore" the rescored score, which is set by 3920 * cleanup_suggestions(). 3921 * First find words with a small edit distance, because this is much 3922 * faster and often already finds the top-N suggestions. If we didn't 3923 * find many suggestions try again with a higher edit distance. 3924 * "sl_sounddone" is used to avoid doing the same word twice. 3925 */ 3926 suggest_try_soundalike_prep(); 3927 su->su_maxscore = SCORE_SFMAX1; 3928 su->su_sfmaxscore = SCORE_MAXINIT * 3; 3929 suggest_try_soundalike(su); 3930 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) 3931 { 3932 /* We didn't find enough matches, try again, allowing more 3933 * changes to the soundfold word. */ 3934 su->su_maxscore = SCORE_SFMAX2; 3935 suggest_try_soundalike(su); 3936 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) 3937 { 3938 /* Still didn't find enough matches, try again, allowing even 3939 * more changes to the soundfold word. */ 3940 su->su_maxscore = SCORE_SFMAX3; 3941 suggest_try_soundalike(su); 3942 } 3943 } 3944 su->su_maxscore = su->su_sfmaxscore; 3945 suggest_try_soundalike_finish(); 3946 } 3947 3948 /* When CTRL-C was hit while searching do show the results. Only clear 3949 * got_int when using a command, not for spellsuggest(). */ 3950 ui_breakcheck(); 3951 if (interactive && got_int) 3952 { 3953 (void)vgetc(); 3954 got_int = FALSE; 3955 } 3956 3957 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) 3958 { 3959 if (sps_flags & SPS_BEST) 3960 /* Adjust the word score for how it sounds like. */ 3961 rescore_suggestions(su); 3962 3963 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3964 check_suggestions(su, &su->su_ga); 3965 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3966 } 3967 } 3968 3969 /* 3970 * Free the info put in "*su" by spell_find_suggest(). 3971 */ 3972 static void 3973 spell_find_cleanup(suginfo_T *su) 3974 { 3975 int i; 3976 3977 /* Free the suggestions. */ 3978 for (i = 0; i < su->su_ga.ga_len; ++i) 3979 vim_free(SUG(su->su_ga, i).st_word); 3980 ga_clear(&su->su_ga); 3981 for (i = 0; i < su->su_sga.ga_len; ++i) 3982 vim_free(SUG(su->su_sga, i).st_word); 3983 ga_clear(&su->su_sga); 3984 3985 /* Free the banned words. */ 3986 hash_clear_all(&su->su_banned, 0); 3987 } 3988 3989 /* 3990 * Make a copy of "word", with the first letter upper or lower cased, to 3991 * "wcopy[MAXWLEN]". "word" must not be empty. 3992 * The result is NUL terminated. 3993 */ 3994 void 3995 onecap_copy( 3996 char_u *word, 3997 char_u *wcopy, 3998 int upper) /* TRUE: first letter made upper case */ 3999 { 4000 char_u *p; 4001 int c; 4002 int l; 4003 4004 p = word; 4005 if (has_mbyte) 4006 c = mb_cptr2char_adv(&p); 4007 else 4008 c = *p++; 4009 if (upper) 4010 c = SPELL_TOUPPER(c); 4011 else 4012 c = SPELL_TOFOLD(c); 4013 if (has_mbyte) 4014 l = mb_char2bytes(c, wcopy); 4015 else 4016 { 4017 l = 1; 4018 wcopy[0] = c; 4019 } 4020 vim_strncpy(wcopy + l, p, MAXWLEN - l - 1); 4021 } 4022 4023 /* 4024 * Make a copy of "word" with all the letters upper cased into 4025 * "wcopy[MAXWLEN]". The result is NUL terminated. 4026 */ 4027 static void 4028 allcap_copy(char_u *word, char_u *wcopy) 4029 { 4030 char_u *s; 4031 char_u *d; 4032 int c; 4033 4034 d = wcopy; 4035 for (s = word; *s != NUL; ) 4036 { 4037 if (has_mbyte) 4038 c = mb_cptr2char_adv(&s); 4039 else 4040 c = *s++; 4041 4042 /* We only change 0xdf to SS when we are certain latin1 is used. It 4043 * would cause weird errors in other 8-bit encodings. */ 4044 if (enc_latin1like && c == 0xdf) 4045 { 4046 c = 'S'; 4047 if (d - wcopy >= MAXWLEN - 1) 4048 break; 4049 *d++ = c; 4050 } 4051 else 4052 c = SPELL_TOUPPER(c); 4053 4054 if (has_mbyte) 4055 { 4056 if (d - wcopy >= MAXWLEN - MB_MAXBYTES) 4057 break; 4058 d += mb_char2bytes(c, d); 4059 } 4060 else 4061 { 4062 if (d - wcopy >= MAXWLEN - 1) 4063 break; 4064 *d++ = c; 4065 } 4066 } 4067 *d = NUL; 4068 } 4069 4070 /* 4071 * Try finding suggestions by recognizing specific situations. 4072 */ 4073 static void 4074 suggest_try_special(suginfo_T *su) 4075 { 4076 char_u *p; 4077 size_t len; 4078 int c; 4079 char_u word[MAXWLEN]; 4080 4081 /* 4082 * Recognize a word that is repeated: "the the". 4083 */ 4084 p = skiptowhite(su->su_fbadword); 4085 len = p - su->su_fbadword; 4086 p = skipwhite(p); 4087 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0) 4088 { 4089 /* Include badflags: if the badword is onecap or allcap 4090 * use that for the goodword too: "The the" -> "The". */ 4091 c = su->su_fbadword[len]; 4092 su->su_fbadword[len] = NUL; 4093 make_case_word(su->su_fbadword, word, su->su_badflags); 4094 su->su_fbadword[len] = c; 4095 4096 /* Give a soundalike score of 0, compute the score as if deleting one 4097 * character. */ 4098 add_suggestion(su, &su->su_ga, word, su->su_badlen, 4099 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE); 4100 } 4101 } 4102 4103 /* 4104 * Change the 0 to 1 to measure how much time is spent in each state. 4105 * Output is dumped in "suggestprof". 4106 */ 4107 #if 0 4108 # define SUGGEST_PROFILE 4109 proftime_T current; 4110 proftime_T total; 4111 proftime_T times[STATE_FINAL + 1]; 4112 long counts[STATE_FINAL + 1]; 4113 4114 static void 4115 prof_init(void) 4116 { 4117 for (int i = 0; i <= STATE_FINAL; ++i) 4118 { 4119 profile_zero(×[i]); 4120 counts[i] = 0; 4121 } 4122 profile_start(¤t); 4123 profile_start(&total); 4124 } 4125 4126 /* call before changing state */ 4127 static void 4128 prof_store(state_T state) 4129 { 4130 profile_end(¤t); 4131 profile_add(×[state], ¤t); 4132 ++counts[state]; 4133 profile_start(¤t); 4134 } 4135 # define PROF_STORE(state) prof_store(state); 4136 4137 static void 4138 prof_report(char *name) 4139 { 4140 FILE *fd = fopen("suggestprof", "a"); 4141 4142 profile_end(&total); 4143 fprintf(fd, "-----------------------\n"); 4144 fprintf(fd, "%s: %s\n", name, profile_msg(&total)); 4145 for (int i = 0; i <= STATE_FINAL; ++i) 4146 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(×[i]), counts[i]); 4147 fclose(fd); 4148 } 4149 #else 4150 # define PROF_STORE(state) 4151 #endif 4152 4153 /* 4154 * Try finding suggestions by adding/removing/swapping letters. 4155 */ 4156 static void 4157 suggest_try_change(suginfo_T *su) 4158 { 4159 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */ 4160 int n; 4161 char_u *p; 4162 int lpi; 4163 langp_T *lp; 4164 4165 /* We make a copy of the case-folded bad word, so that we can modify it 4166 * to find matches (esp. REP items). Append some more text, changing 4167 * chars after the bad word may help. */ 4168 STRCPY(fword, su->su_fbadword); 4169 n = (int)STRLEN(fword); 4170 p = su->su_badptr + su->su_badlen; 4171 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n); 4172 4173 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 4174 { 4175 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 4176 4177 /* If reloading a spell file fails it's still in the list but 4178 * everything has been cleared. */ 4179 if (lp->lp_slang->sl_fbyts == NULL) 4180 continue; 4181 4182 /* Try it for this language. Will add possible suggestions. */ 4183 #ifdef SUGGEST_PROFILE 4184 prof_init(); 4185 #endif 4186 suggest_trie_walk(su, lp, fword, FALSE); 4187 #ifdef SUGGEST_PROFILE 4188 prof_report("try_change"); 4189 #endif 4190 } 4191 } 4192 4193 /* Check the maximum score, if we go over it we won't try this change. */ 4194 #define TRY_DEEPER(su, stack, depth, add) \ 4195 (stack[depth].ts_score + (add) < su->su_maxscore) 4196 4197 /* 4198 * Try finding suggestions by adding/removing/swapping letters. 4199 * 4200 * This uses a state machine. At each node in the tree we try various 4201 * operations. When trying if an operation works "depth" is increased and the 4202 * stack[] is used to store info. This allows combinations, thus insert one 4203 * character, replace one and delete another. The number of changes is 4204 * limited by su->su_maxscore. 4205 * 4206 * After implementing this I noticed an article by Kemal Oflazer that 4207 * describes something similar: "Error-tolerant Finite State Recognition with 4208 * Applications to Morphological Analysis and Spelling Correction" (1996). 4209 * The implementation in the article is simplified and requires a stack of 4210 * unknown depth. The implementation here only needs a stack depth equal to 4211 * the length of the word. 4212 * 4213 * This is also used for the sound-folded word, "soundfold" is TRUE then. 4214 * The mechanism is the same, but we find a match with a sound-folded word 4215 * that comes from one or more original words. Each of these words may be 4216 * added, this is done by add_sound_suggest(). 4217 * Don't use: 4218 * the prefix tree or the keep-case tree 4219 * "su->su_badlen" 4220 * anything to do with upper and lower case 4221 * anything to do with word or non-word characters ("spell_iswordp()") 4222 * banned words 4223 * word flags (rare, region, compounding) 4224 * word splitting for now 4225 * "similar_chars()" 4226 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" 4227 */ 4228 static void 4229 suggest_trie_walk( 4230 suginfo_T *su, 4231 langp_T *lp, 4232 char_u *fword, 4233 int soundfold) 4234 { 4235 char_u tword[MAXWLEN]; /* good word collected so far */ 4236 trystate_T stack[MAXWLEN]; 4237 char_u preword[MAXWLEN * 3]; /* word found with proper case; 4238 * concatenation of prefix compound 4239 * words and split word. NUL terminated 4240 * when going deeper but not when coming 4241 * back. */ 4242 char_u compflags[MAXWLEN]; /* compound flags, one for each word */ 4243 trystate_T *sp; 4244 int newscore; 4245 int score; 4246 char_u *byts, *fbyts, *pbyts; 4247 idx_T *idxs, *fidxs, *pidxs; 4248 int depth; 4249 int c, c2, c3; 4250 int n = 0; 4251 int flags; 4252 garray_T *gap; 4253 idx_T arridx; 4254 int len; 4255 char_u *p; 4256 fromto_T *ftp; 4257 int fl = 0, tl; 4258 int repextra = 0; /* extra bytes in fword[] from REP item */ 4259 slang_T *slang = lp->lp_slang; 4260 int fword_ends; 4261 int goodword_ends; 4262 #ifdef DEBUG_TRIEWALK 4263 /* Stores the name of the change made at each level. */ 4264 char_u changename[MAXWLEN][80]; 4265 #endif 4266 int breakcheckcount = 1000; 4267 int compound_ok; 4268 4269 /* 4270 * Go through the whole case-fold tree, try changes at each node. 4271 * "tword[]" contains the word collected from nodes in the tree. 4272 * "fword[]" the word we are trying to match with (initially the bad 4273 * word). 4274 */ 4275 depth = 0; 4276 sp = &stack[0]; 4277 vim_memset(sp, 0, sizeof(trystate_T)); 4278 sp->ts_curi = 1; 4279 4280 if (soundfold) 4281 { 4282 /* Going through the soundfold tree. */ 4283 byts = fbyts = slang->sl_sbyts; 4284 idxs = fidxs = slang->sl_sidxs; 4285 pbyts = NULL; 4286 pidxs = NULL; 4287 sp->ts_prefixdepth = PFD_NOPREFIX; 4288 sp->ts_state = STATE_START; 4289 } 4290 else 4291 { 4292 /* 4293 * When there are postponed prefixes we need to use these first. At 4294 * the end of the prefix we continue in the case-fold tree. 4295 */ 4296 fbyts = slang->sl_fbyts; 4297 fidxs = slang->sl_fidxs; 4298 pbyts = slang->sl_pbyts; 4299 pidxs = slang->sl_pidxs; 4300 if (pbyts != NULL) 4301 { 4302 byts = pbyts; 4303 idxs = pidxs; 4304 sp->ts_prefixdepth = PFD_PREFIXTREE; 4305 sp->ts_state = STATE_NOPREFIX; /* try without prefix first */ 4306 } 4307 else 4308 { 4309 byts = fbyts; 4310 idxs = fidxs; 4311 sp->ts_prefixdepth = PFD_NOPREFIX; 4312 sp->ts_state = STATE_START; 4313 } 4314 } 4315 4316 /* 4317 * Loop to find all suggestions. At each round we either: 4318 * - For the current state try one operation, advance "ts_curi", 4319 * increase "depth". 4320 * - When a state is done go to the next, set "ts_state". 4321 * - When all states are tried decrease "depth". 4322 */ 4323 while (depth >= 0 && !got_int) 4324 { 4325 sp = &stack[depth]; 4326 switch (sp->ts_state) 4327 { 4328 case STATE_START: 4329 case STATE_NOPREFIX: 4330 /* 4331 * Start of node: Deal with NUL bytes, which means 4332 * tword[] may end here. 4333 */ 4334 arridx = sp->ts_arridx; /* current node in the tree */ 4335 len = byts[arridx]; /* bytes in this node */ 4336 arridx += sp->ts_curi; /* index of current byte */ 4337 4338 if (sp->ts_prefixdepth == PFD_PREFIXTREE) 4339 { 4340 /* Skip over the NUL bytes, we use them later. */ 4341 for (n = 0; n < len && byts[arridx + n] == 0; ++n) 4342 ; 4343 sp->ts_curi += n; 4344 4345 /* Always past NUL bytes now. */ 4346 n = (int)sp->ts_state; 4347 PROF_STORE(sp->ts_state) 4348 sp->ts_state = STATE_ENDNUL; 4349 sp->ts_save_badflags = su->su_badflags; 4350 4351 /* At end of a prefix or at start of prefixtree: check for 4352 * following word. */ 4353 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) 4354 { 4355 /* Set su->su_badflags to the caps type at this position. 4356 * Use the caps type until here for the prefix itself. */ 4357 if (has_mbyte) 4358 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 4359 else 4360 n = sp->ts_fidx; 4361 flags = badword_captype(su->su_badptr, su->su_badptr + n); 4362 su->su_badflags = badword_captype(su->su_badptr + n, 4363 su->su_badptr + su->su_badlen); 4364 #ifdef DEBUG_TRIEWALK 4365 sprintf(changename[depth], "prefix"); 4366 #endif 4367 go_deeper(stack, depth, 0); 4368 ++depth; 4369 sp = &stack[depth]; 4370 sp->ts_prefixdepth = depth - 1; 4371 byts = fbyts; 4372 idxs = fidxs; 4373 sp->ts_arridx = 0; 4374 4375 /* Move the prefix to preword[] with the right case 4376 * and make find_keepcap_word() works. */ 4377 tword[sp->ts_twordlen] = NUL; 4378 make_case_word(tword + sp->ts_splitoff, 4379 preword + sp->ts_prewordlen, flags); 4380 sp->ts_prewordlen = (char_u)STRLEN(preword); 4381 sp->ts_splitoff = sp->ts_twordlen; 4382 } 4383 break; 4384 } 4385 4386 if (sp->ts_curi > len || byts[arridx] != 0) 4387 { 4388 /* Past bytes in node and/or past NUL bytes. */ 4389 PROF_STORE(sp->ts_state) 4390 sp->ts_state = STATE_ENDNUL; 4391 sp->ts_save_badflags = su->su_badflags; 4392 break; 4393 } 4394 4395 /* 4396 * End of word in tree. 4397 */ 4398 ++sp->ts_curi; /* eat one NUL byte */ 4399 4400 flags = (int)idxs[arridx]; 4401 4402 /* Skip words with the NOSUGGEST flag. */ 4403 if (flags & WF_NOSUGGEST) 4404 break; 4405 4406 fword_ends = (fword[sp->ts_fidx] == NUL 4407 || (soundfold 4408 ? VIM_ISWHITE(fword[sp->ts_fidx]) 4409 : !spell_iswordp(fword + sp->ts_fidx, curwin))); 4410 tword[sp->ts_twordlen] = NUL; 4411 4412 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL 4413 && (sp->ts_flags & TSF_PREFIXOK) == 0) 4414 { 4415 /* There was a prefix before the word. Check that the prefix 4416 * can be used with this word. */ 4417 /* Count the length of the NULs in the prefix. If there are 4418 * none this must be the first try without a prefix. */ 4419 n = stack[sp->ts_prefixdepth].ts_arridx; 4420 len = pbyts[n++]; 4421 for (c = 0; c < len && pbyts[n + c] == 0; ++c) 4422 ; 4423 if (c > 0) 4424 { 4425 c = valid_word_prefix(c, n, flags, 4426 tword + sp->ts_splitoff, slang, FALSE); 4427 if (c == 0) 4428 break; 4429 4430 /* Use the WF_RARE flag for a rare prefix. */ 4431 if (c & WF_RAREPFX) 4432 flags |= WF_RARE; 4433 4434 /* Tricky: when checking for both prefix and compounding 4435 * we run into the prefix flag first. 4436 * Remember that it's OK, so that we accept the prefix 4437 * when arriving at a compound flag. */ 4438 sp->ts_flags |= TSF_PREFIXOK; 4439 } 4440 } 4441 4442 /* Check NEEDCOMPOUND: can't use word without compounding. Do try 4443 * appending another compound word below. */ 4444 if (sp->ts_complen == sp->ts_compsplit && fword_ends 4445 && (flags & WF_NEEDCOMP)) 4446 goodword_ends = FALSE; 4447 else 4448 goodword_ends = TRUE; 4449 4450 p = NULL; 4451 compound_ok = TRUE; 4452 if (sp->ts_complen > sp->ts_compsplit) 4453 { 4454 if (slang->sl_nobreak) 4455 { 4456 /* There was a word before this word. When there was no 4457 * change in this word (it was correct) add the first word 4458 * as a suggestion. If this word was corrected too, we 4459 * need to check if a correct word follows. */ 4460 if (sp->ts_fidx - sp->ts_splitfidx 4461 == sp->ts_twordlen - sp->ts_splitoff 4462 && STRNCMP(fword + sp->ts_splitfidx, 4463 tword + sp->ts_splitoff, 4464 sp->ts_fidx - sp->ts_splitfidx) == 0) 4465 { 4466 preword[sp->ts_prewordlen] = NUL; 4467 newscore = score_wordcount_adj(slang, sp->ts_score, 4468 preword + sp->ts_prewordlen, 4469 sp->ts_prewordlen > 0); 4470 /* Add the suggestion if the score isn't too bad. */ 4471 if (newscore <= su->su_maxscore) 4472 add_suggestion(su, &su->su_ga, preword, 4473 sp->ts_splitfidx - repextra, 4474 newscore, 0, FALSE, 4475 lp->lp_sallang, FALSE); 4476 break; 4477 } 4478 } 4479 else 4480 { 4481 /* There was a compound word before this word. If this 4482 * word does not support compounding then give up 4483 * (splitting is tried for the word without compound 4484 * flag). */ 4485 if (((unsigned)flags >> 24) == 0 4486 || sp->ts_twordlen - sp->ts_splitoff 4487 < slang->sl_compminlen) 4488 break; 4489 /* For multi-byte chars check character length against 4490 * COMPOUNDMIN. */ 4491 if (has_mbyte 4492 && slang->sl_compminlen > 0 4493 && mb_charlen(tword + sp->ts_splitoff) 4494 < slang->sl_compminlen) 4495 break; 4496 4497 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 4498 compflags[sp->ts_complen + 1] = NUL; 4499 vim_strncpy(preword + sp->ts_prewordlen, 4500 tword + sp->ts_splitoff, 4501 sp->ts_twordlen - sp->ts_splitoff); 4502 4503 /* Verify CHECKCOMPOUNDPATTERN rules. */ 4504 if (match_checkcompoundpattern(preword, sp->ts_prewordlen, 4505 &slang->sl_comppat)) 4506 compound_ok = FALSE; 4507 4508 if (compound_ok) 4509 { 4510 p = preword; 4511 while (*skiptowhite(p) != NUL) 4512 p = skipwhite(skiptowhite(p)); 4513 if (fword_ends && !can_compound(slang, p, 4514 compflags + sp->ts_compsplit)) 4515 /* Compound is not allowed. But it may still be 4516 * possible if we add another (short) word. */ 4517 compound_ok = FALSE; 4518 } 4519 4520 /* Get pointer to last char of previous word. */ 4521 p = preword + sp->ts_prewordlen; 4522 MB_PTR_BACK(preword, p); 4523 } 4524 } 4525 4526 /* 4527 * Form the word with proper case in preword. 4528 * If there is a word from a previous split, append. 4529 * For the soundfold tree don't change the case, simply append. 4530 */ 4531 if (soundfold) 4532 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); 4533 else if (flags & WF_KEEPCAP) 4534 /* Must find the word in the keep-case tree. */ 4535 find_keepcap_word(slang, tword + sp->ts_splitoff, 4536 preword + sp->ts_prewordlen); 4537 else 4538 { 4539 /* Include badflags: If the badword is onecap or allcap 4540 * use that for the goodword too. But if the badword is 4541 * allcap and it's only one char long use onecap. */ 4542 c = su->su_badflags; 4543 if ((c & WF_ALLCAP) 4544 && su->su_badlen == (*mb_ptr2len)(su->su_badptr)) 4545 c = WF_ONECAP; 4546 c |= flags; 4547 4548 /* When appending a compound word after a word character don't 4549 * use Onecap. */ 4550 if (p != NULL && spell_iswordp_nmw(p, curwin)) 4551 c &= ~WF_ONECAP; 4552 make_case_word(tword + sp->ts_splitoff, 4553 preword + sp->ts_prewordlen, c); 4554 } 4555 4556 if (!soundfold) 4557 { 4558 /* Don't use a banned word. It may appear again as a good 4559 * word, thus remember it. */ 4560 if (flags & WF_BANNED) 4561 { 4562 add_banned(su, preword + sp->ts_prewordlen); 4563 break; 4564 } 4565 if ((sp->ts_complen == sp->ts_compsplit 4566 && WAS_BANNED(su, preword + sp->ts_prewordlen)) 4567 || WAS_BANNED(su, preword)) 4568 { 4569 if (slang->sl_compprog == NULL) 4570 break; 4571 /* the word so far was banned but we may try compounding */ 4572 goodword_ends = FALSE; 4573 } 4574 } 4575 4576 newscore = 0; 4577 if (!soundfold) /* soundfold words don't have flags */ 4578 { 4579 if ((flags & WF_REGION) 4580 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 4581 newscore += SCORE_REGION; 4582 if (flags & WF_RARE) 4583 newscore += SCORE_RARE; 4584 4585 if (!spell_valid_case(su->su_badflags, 4586 captype(preword + sp->ts_prewordlen, NULL))) 4587 newscore += SCORE_ICASE; 4588 } 4589 4590 /* TODO: how about splitting in the soundfold tree? */ 4591 if (fword_ends 4592 && goodword_ends 4593 && sp->ts_fidx >= sp->ts_fidxtry 4594 && compound_ok) 4595 { 4596 /* The badword also ends: add suggestions. */ 4597 #ifdef DEBUG_TRIEWALK 4598 if (soundfold && STRCMP(preword, "smwrd") == 0) 4599 { 4600 int j; 4601 4602 /* print the stack of changes that brought us here */ 4603 smsg("------ %s -------", fword); 4604 for (j = 0; j < depth; ++j) 4605 smsg("%s", changename[j]); 4606 } 4607 #endif 4608 if (soundfold) 4609 { 4610 /* For soundfolded words we need to find the original 4611 * words, the edit distance and then add them. */ 4612 add_sound_suggest(su, preword, sp->ts_score, lp); 4613 } 4614 else if (sp->ts_fidx > 0) 4615 { 4616 /* Give a penalty when changing non-word char to word 4617 * char, e.g., "thes," -> "these". */ 4618 p = fword + sp->ts_fidx; 4619 MB_PTR_BACK(fword, p); 4620 if (!spell_iswordp(p, curwin)) 4621 { 4622 p = preword + STRLEN(preword); 4623 MB_PTR_BACK(preword, p); 4624 if (spell_iswordp(p, curwin)) 4625 newscore += SCORE_NONWORD; 4626 } 4627 4628 /* Give a bonus to words seen before. */ 4629 score = score_wordcount_adj(slang, 4630 sp->ts_score + newscore, 4631 preword + sp->ts_prewordlen, 4632 sp->ts_prewordlen > 0); 4633 4634 /* Add the suggestion if the score isn't too bad. */ 4635 if (score <= su->su_maxscore) 4636 { 4637 add_suggestion(su, &su->su_ga, preword, 4638 sp->ts_fidx - repextra, 4639 score, 0, FALSE, lp->lp_sallang, FALSE); 4640 4641 if (su->su_badflags & WF_MIXCAP) 4642 { 4643 /* We really don't know if the word should be 4644 * upper or lower case, add both. */ 4645 c = captype(preword, NULL); 4646 if (c == 0 || c == WF_ALLCAP) 4647 { 4648 make_case_word(tword + sp->ts_splitoff, 4649 preword + sp->ts_prewordlen, 4650 c == 0 ? WF_ALLCAP : 0); 4651 4652 add_suggestion(su, &su->su_ga, preword, 4653 sp->ts_fidx - repextra, 4654 score + SCORE_ICASE, 0, FALSE, 4655 lp->lp_sallang, FALSE); 4656 } 4657 } 4658 } 4659 } 4660 } 4661 4662 /* 4663 * Try word split and/or compounding. 4664 */ 4665 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) 4666 /* Don't split halfway a character. */ 4667 && (!has_mbyte || sp->ts_tcharlen == 0)) 4668 { 4669 int try_compound; 4670 int try_split; 4671 4672 /* If past the end of the bad word don't try a split. 4673 * Otherwise try changing the next word. E.g., find 4674 * suggestions for "the the" where the second "the" is 4675 * different. It's done like a split. 4676 * TODO: word split for soundfold words */ 4677 try_split = (sp->ts_fidx - repextra < su->su_badlen) 4678 && !soundfold; 4679 4680 /* Get here in several situations: 4681 * 1. The word in the tree ends: 4682 * If the word allows compounding try that. Otherwise try 4683 * a split by inserting a space. For both check that a 4684 * valid words starts at fword[sp->ts_fidx]. 4685 * For NOBREAK do like compounding to be able to check if 4686 * the next word is valid. 4687 * 2. The badword does end, but it was due to a change (e.g., 4688 * a swap). No need to split, but do check that the 4689 * following word is valid. 4690 * 3. The badword and the word in the tree end. It may still 4691 * be possible to compound another (short) word. 4692 */ 4693 try_compound = FALSE; 4694 if (!soundfold 4695 && !slang->sl_nocompoundsugs 4696 && slang->sl_compprog != NULL 4697 && ((unsigned)flags >> 24) != 0 4698 && sp->ts_twordlen - sp->ts_splitoff 4699 >= slang->sl_compminlen 4700 && (!has_mbyte 4701 || slang->sl_compminlen == 0 4702 || mb_charlen(tword + sp->ts_splitoff) 4703 >= slang->sl_compminlen) 4704 && (slang->sl_compsylmax < MAXWLEN 4705 || sp->ts_complen + 1 - sp->ts_compsplit 4706 < slang->sl_compmax) 4707 && (can_be_compound(sp, slang, 4708 compflags, ((unsigned)flags >> 24)))) 4709 4710 { 4711 try_compound = TRUE; 4712 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 4713 compflags[sp->ts_complen + 1] = NUL; 4714 } 4715 4716 /* For NOBREAK we never try splitting, it won't make any word 4717 * valid. */ 4718 if (slang->sl_nobreak && !slang->sl_nocompoundsugs) 4719 try_compound = TRUE; 4720 4721 /* If we could add a compound word, and it's also possible to 4722 * split at this point, do the split first and set 4723 * TSF_DIDSPLIT to avoid doing it again. */ 4724 else if (!fword_ends 4725 && try_compound 4726 && (sp->ts_flags & TSF_DIDSPLIT) == 0) 4727 { 4728 try_compound = FALSE; 4729 sp->ts_flags |= TSF_DIDSPLIT; 4730 --sp->ts_curi; /* do the same NUL again */ 4731 compflags[sp->ts_complen] = NUL; 4732 } 4733 else 4734 sp->ts_flags &= ~TSF_DIDSPLIT; 4735 4736 if (try_split || try_compound) 4737 { 4738 if (!try_compound && (!fword_ends || !goodword_ends)) 4739 { 4740 /* If we're going to split need to check that the 4741 * words so far are valid for compounding. If there 4742 * is only one word it must not have the NEEDCOMPOUND 4743 * flag. */ 4744 if (sp->ts_complen == sp->ts_compsplit 4745 && (flags & WF_NEEDCOMP)) 4746 break; 4747 p = preword; 4748 while (*skiptowhite(p) != NUL) 4749 p = skipwhite(skiptowhite(p)); 4750 if (sp->ts_complen > sp->ts_compsplit 4751 && !can_compound(slang, p, 4752 compflags + sp->ts_compsplit)) 4753 break; 4754 4755 if (slang->sl_nosplitsugs) 4756 newscore += SCORE_SPLIT_NO; 4757 else 4758 newscore += SCORE_SPLIT; 4759 4760 /* Give a bonus to words seen before. */ 4761 newscore = score_wordcount_adj(slang, newscore, 4762 preword + sp->ts_prewordlen, TRUE); 4763 } 4764 4765 if (TRY_DEEPER(su, stack, depth, newscore)) 4766 { 4767 go_deeper(stack, depth, newscore); 4768 #ifdef DEBUG_TRIEWALK 4769 if (!try_compound && !fword_ends) 4770 sprintf(changename[depth], "%.*s-%s: split", 4771 sp->ts_twordlen, tword, fword + sp->ts_fidx); 4772 else 4773 sprintf(changename[depth], "%.*s-%s: compound", 4774 sp->ts_twordlen, tword, fword + sp->ts_fidx); 4775 #endif 4776 /* Save things to be restored at STATE_SPLITUNDO. */ 4777 sp->ts_save_badflags = su->su_badflags; 4778 PROF_STORE(sp->ts_state) 4779 sp->ts_state = STATE_SPLITUNDO; 4780 4781 ++depth; 4782 sp = &stack[depth]; 4783 4784 /* Append a space to preword when splitting. */ 4785 if (!try_compound && !fword_ends) 4786 STRCAT(preword, " "); 4787 sp->ts_prewordlen = (char_u)STRLEN(preword); 4788 sp->ts_splitoff = sp->ts_twordlen; 4789 sp->ts_splitfidx = sp->ts_fidx; 4790 4791 /* If the badword has a non-word character at this 4792 * position skip it. That means replacing the 4793 * non-word character with a space. Always skip a 4794 * character when the word ends. But only when the 4795 * good word can end. */ 4796 if (((!try_compound && !spell_iswordp_nmw(fword 4797 + sp->ts_fidx, 4798 curwin)) 4799 || fword_ends) 4800 && fword[sp->ts_fidx] != NUL 4801 && goodword_ends) 4802 { 4803 int l; 4804 4805 l = MB_PTR2LEN(fword + sp->ts_fidx); 4806 if (fword_ends) 4807 { 4808 /* Copy the skipped character to preword. */ 4809 mch_memmove(preword + sp->ts_prewordlen, 4810 fword + sp->ts_fidx, l); 4811 sp->ts_prewordlen += l; 4812 preword[sp->ts_prewordlen] = NUL; 4813 } 4814 else 4815 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST; 4816 sp->ts_fidx += l; 4817 } 4818 4819 /* When compounding include compound flag in 4820 * compflags[] (already set above). When splitting we 4821 * may start compounding over again. */ 4822 if (try_compound) 4823 ++sp->ts_complen; 4824 else 4825 sp->ts_compsplit = sp->ts_complen; 4826 sp->ts_prefixdepth = PFD_NOPREFIX; 4827 4828 /* set su->su_badflags to the caps type at this 4829 * position */ 4830 if (has_mbyte) 4831 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 4832 else 4833 n = sp->ts_fidx; 4834 su->su_badflags = badword_captype(su->su_badptr + n, 4835 su->su_badptr + su->su_badlen); 4836 4837 /* Restart at top of the tree. */ 4838 sp->ts_arridx = 0; 4839 4840 /* If there are postponed prefixes, try these too. */ 4841 if (pbyts != NULL) 4842 { 4843 byts = pbyts; 4844 idxs = pidxs; 4845 sp->ts_prefixdepth = PFD_PREFIXTREE; 4846 PROF_STORE(sp->ts_state) 4847 sp->ts_state = STATE_NOPREFIX; 4848 } 4849 } 4850 } 4851 } 4852 break; 4853 4854 case STATE_SPLITUNDO: 4855 /* Undo the changes done for word split or compound word. */ 4856 su->su_badflags = sp->ts_save_badflags; 4857 4858 /* Continue looking for NUL bytes. */ 4859 PROF_STORE(sp->ts_state) 4860 sp->ts_state = STATE_START; 4861 4862 /* In case we went into the prefix tree. */ 4863 byts = fbyts; 4864 idxs = fidxs; 4865 break; 4866 4867 case STATE_ENDNUL: 4868 /* Past the NUL bytes in the node. */ 4869 su->su_badflags = sp->ts_save_badflags; 4870 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0) 4871 { 4872 /* The badword ends, can't use STATE_PLAIN. */ 4873 PROF_STORE(sp->ts_state) 4874 sp->ts_state = STATE_DEL; 4875 break; 4876 } 4877 PROF_STORE(sp->ts_state) 4878 sp->ts_state = STATE_PLAIN; 4879 /* FALLTHROUGH */ 4880 4881 case STATE_PLAIN: 4882 /* 4883 * Go over all possible bytes at this node, add each to tword[] 4884 * and use child node. "ts_curi" is the index. 4885 */ 4886 arridx = sp->ts_arridx; 4887 if (sp->ts_curi > byts[arridx]) 4888 { 4889 /* Done all bytes at this node, do next state. When still at 4890 * already changed bytes skip the other tricks. */ 4891 PROF_STORE(sp->ts_state) 4892 if (sp->ts_fidx >= sp->ts_fidxtry) 4893 sp->ts_state = STATE_DEL; 4894 else 4895 sp->ts_state = STATE_FINAL; 4896 } 4897 else 4898 { 4899 arridx += sp->ts_curi++; 4900 c = byts[arridx]; 4901 4902 /* Normal byte, go one level deeper. If it's not equal to the 4903 * byte in the bad word adjust the score. But don't even try 4904 * when the byte was already changed. And don't try when we 4905 * just deleted this byte, accepting it is always cheaper than 4906 * delete + substitute. */ 4907 if (c == fword[sp->ts_fidx] 4908 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE)) 4909 newscore = 0; 4910 else 4911 newscore = SCORE_SUBST; 4912 if ((newscore == 0 4913 || (sp->ts_fidx >= sp->ts_fidxtry 4914 && ((sp->ts_flags & TSF_DIDDEL) == 0 4915 || c != fword[sp->ts_delidx]))) 4916 && TRY_DEEPER(su, stack, depth, newscore)) 4917 { 4918 go_deeper(stack, depth, newscore); 4919 #ifdef DEBUG_TRIEWALK 4920 if (newscore > 0) 4921 sprintf(changename[depth], "%.*s-%s: subst %c to %c", 4922 sp->ts_twordlen, tword, fword + sp->ts_fidx, 4923 fword[sp->ts_fidx], c); 4924 else 4925 sprintf(changename[depth], "%.*s-%s: accept %c", 4926 sp->ts_twordlen, tword, fword + sp->ts_fidx, 4927 fword[sp->ts_fidx]); 4928 #endif 4929 ++depth; 4930 sp = &stack[depth]; 4931 ++sp->ts_fidx; 4932 tword[sp->ts_twordlen++] = c; 4933 sp->ts_arridx = idxs[arridx]; 4934 if (newscore == SCORE_SUBST) 4935 sp->ts_isdiff = DIFF_YES; 4936 if (has_mbyte) 4937 { 4938 /* Multi-byte characters are a bit complicated to 4939 * handle: They differ when any of the bytes differ 4940 * and then their length may also differ. */ 4941 if (sp->ts_tcharlen == 0) 4942 { 4943 /* First byte. */ 4944 sp->ts_tcharidx = 0; 4945 sp->ts_tcharlen = MB_BYTE2LEN(c); 4946 sp->ts_fcharstart = sp->ts_fidx - 1; 4947 sp->ts_isdiff = (newscore != 0) 4948 ? DIFF_YES : DIFF_NONE; 4949 } 4950 else if (sp->ts_isdiff == DIFF_INSERT) 4951 /* When inserting trail bytes don't advance in the 4952 * bad word. */ 4953 --sp->ts_fidx; 4954 if (++sp->ts_tcharidx == sp->ts_tcharlen) 4955 { 4956 /* Last byte of character. */ 4957 if (sp->ts_isdiff == DIFF_YES) 4958 { 4959 /* Correct ts_fidx for the byte length of the 4960 * character (we didn't check that before). */ 4961 sp->ts_fidx = sp->ts_fcharstart 4962 + MB_PTR2LEN( 4963 fword + sp->ts_fcharstart); 4964 /* For changing a composing character adjust 4965 * the score from SCORE_SUBST to 4966 * SCORE_SUBCOMP. */ 4967 if (enc_utf8 4968 && utf_iscomposing( 4969 utf_ptr2char(tword 4970 + sp->ts_twordlen 4971 - sp->ts_tcharlen)) 4972 && utf_iscomposing( 4973 utf_ptr2char(fword 4974 + sp->ts_fcharstart))) 4975 sp->ts_score -= 4976 SCORE_SUBST - SCORE_SUBCOMP; 4977 4978 /* For a similar character adjust score from 4979 * SCORE_SUBST to SCORE_SIMILAR. */ 4980 else if (!soundfold 4981 && slang->sl_has_map 4982 && similar_chars(slang, 4983 mb_ptr2char(tword 4984 + sp->ts_twordlen 4985 - sp->ts_tcharlen), 4986 mb_ptr2char(fword 4987 + sp->ts_fcharstart))) 4988 sp->ts_score -= 4989 SCORE_SUBST - SCORE_SIMILAR; 4990 } 4991 else if (sp->ts_isdiff == DIFF_INSERT 4992 && sp->ts_twordlen > sp->ts_tcharlen) 4993 { 4994 p = tword + sp->ts_twordlen - sp->ts_tcharlen; 4995 c = mb_ptr2char(p); 4996 if (enc_utf8 && utf_iscomposing(c)) 4997 { 4998 /* Inserting a composing char doesn't 4999 * count that much. */ 5000 sp->ts_score -= SCORE_INS - SCORE_INSCOMP; 5001 } 5002 else 5003 { 5004 /* If the previous character was the same, 5005 * thus doubling a character, give a bonus 5006 * to the score. Also for the soundfold 5007 * tree (might seem illogical but does 5008 * give better scores). */ 5009 MB_PTR_BACK(tword, p); 5010 if (c == mb_ptr2char(p)) 5011 sp->ts_score -= SCORE_INS 5012 - SCORE_INSDUP; 5013 } 5014 } 5015 5016 /* Starting a new char, reset the length. */ 5017 sp->ts_tcharlen = 0; 5018 } 5019 } 5020 else 5021 { 5022 /* If we found a similar char adjust the score. 5023 * We do this after calling go_deeper() because 5024 * it's slow. */ 5025 if (newscore != 0 5026 && !soundfold 5027 && slang->sl_has_map 5028 && similar_chars(slang, 5029 c, fword[sp->ts_fidx - 1])) 5030 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; 5031 } 5032 } 5033 } 5034 break; 5035 5036 case STATE_DEL: 5037 /* When past the first byte of a multi-byte char don't try 5038 * delete/insert/swap a character. */ 5039 if (has_mbyte && sp->ts_tcharlen > 0) 5040 { 5041 PROF_STORE(sp->ts_state) 5042 sp->ts_state = STATE_FINAL; 5043 break; 5044 } 5045 /* 5046 * Try skipping one character in the bad word (delete it). 5047 */ 5048 PROF_STORE(sp->ts_state) 5049 sp->ts_state = STATE_INS_PREP; 5050 sp->ts_curi = 1; 5051 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') 5052 /* Deleting a vowel at the start of a word counts less, see 5053 * soundalike_score(). */ 5054 newscore = 2 * SCORE_DEL / 3; 5055 else 5056 newscore = SCORE_DEL; 5057 if (fword[sp->ts_fidx] != NUL 5058 && TRY_DEEPER(su, stack, depth, newscore)) 5059 { 5060 go_deeper(stack, depth, newscore); 5061 #ifdef DEBUG_TRIEWALK 5062 sprintf(changename[depth], "%.*s-%s: delete %c", 5063 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5064 fword[sp->ts_fidx]); 5065 #endif 5066 ++depth; 5067 5068 /* Remember what character we deleted, so that we can avoid 5069 * inserting it again. */ 5070 stack[depth].ts_flags |= TSF_DIDDEL; 5071 stack[depth].ts_delidx = sp->ts_fidx; 5072 5073 /* Advance over the character in fword[]. Give a bonus to the 5074 * score if the same character is following "nn" -> "n". It's 5075 * a bit illogical for soundfold tree but it does give better 5076 * results. */ 5077 if (has_mbyte) 5078 { 5079 c = mb_ptr2char(fword + sp->ts_fidx); 5080 stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx); 5081 if (enc_utf8 && utf_iscomposing(c)) 5082 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; 5083 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) 5084 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 5085 } 5086 else 5087 { 5088 ++stack[depth].ts_fidx; 5089 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) 5090 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 5091 } 5092 break; 5093 } 5094 /* FALLTHROUGH */ 5095 5096 case STATE_INS_PREP: 5097 if (sp->ts_flags & TSF_DIDDEL) 5098 { 5099 /* If we just deleted a byte then inserting won't make sense, 5100 * a substitute is always cheaper. */ 5101 PROF_STORE(sp->ts_state) 5102 sp->ts_state = STATE_SWAP; 5103 break; 5104 } 5105 5106 /* skip over NUL bytes */ 5107 n = sp->ts_arridx; 5108 for (;;) 5109 { 5110 if (sp->ts_curi > byts[n]) 5111 { 5112 /* Only NUL bytes at this node, go to next state. */ 5113 PROF_STORE(sp->ts_state) 5114 sp->ts_state = STATE_SWAP; 5115 break; 5116 } 5117 if (byts[n + sp->ts_curi] != NUL) 5118 { 5119 /* Found a byte to insert. */ 5120 PROF_STORE(sp->ts_state) 5121 sp->ts_state = STATE_INS; 5122 break; 5123 } 5124 ++sp->ts_curi; 5125 } 5126 break; 5127 5128 /* FALLTHROUGH */ 5129 5130 case STATE_INS: 5131 /* Insert one byte. Repeat this for each possible byte at this 5132 * node. */ 5133 n = sp->ts_arridx; 5134 if (sp->ts_curi > byts[n]) 5135 { 5136 /* Done all bytes at this node, go to next state. */ 5137 PROF_STORE(sp->ts_state) 5138 sp->ts_state = STATE_SWAP; 5139 break; 5140 } 5141 5142 /* Do one more byte at this node, but: 5143 * - Skip NUL bytes. 5144 * - Skip the byte if it's equal to the byte in the word, 5145 * accepting that byte is always better. 5146 */ 5147 n += sp->ts_curi++; 5148 c = byts[n]; 5149 if (soundfold && sp->ts_twordlen == 0 && c == '*') 5150 /* Inserting a vowel at the start of a word counts less, 5151 * see soundalike_score(). */ 5152 newscore = 2 * SCORE_INS / 3; 5153 else 5154 newscore = SCORE_INS; 5155 if (c != fword[sp->ts_fidx] 5156 && TRY_DEEPER(su, stack, depth, newscore)) 5157 { 5158 go_deeper(stack, depth, newscore); 5159 #ifdef DEBUG_TRIEWALK 5160 sprintf(changename[depth], "%.*s-%s: insert %c", 5161 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5162 c); 5163 #endif 5164 ++depth; 5165 sp = &stack[depth]; 5166 tword[sp->ts_twordlen++] = c; 5167 sp->ts_arridx = idxs[n]; 5168 if (has_mbyte) 5169 { 5170 fl = MB_BYTE2LEN(c); 5171 if (fl > 1) 5172 { 5173 /* There are following bytes for the same character. 5174 * We must find all bytes before trying 5175 * delete/insert/swap/etc. */ 5176 sp->ts_tcharlen = fl; 5177 sp->ts_tcharidx = 1; 5178 sp->ts_isdiff = DIFF_INSERT; 5179 } 5180 } 5181 else 5182 fl = 1; 5183 if (fl == 1) 5184 { 5185 /* If the previous character was the same, thus doubling a 5186 * character, give a bonus to the score. Also for 5187 * soundfold words (illogical but does give a better 5188 * score). */ 5189 if (sp->ts_twordlen >= 2 5190 && tword[sp->ts_twordlen - 2] == c) 5191 sp->ts_score -= SCORE_INS - SCORE_INSDUP; 5192 } 5193 } 5194 break; 5195 5196 case STATE_SWAP: 5197 /* 5198 * Swap two bytes in the bad word: "12" -> "21". 5199 * We change "fword" here, it's changed back afterwards at 5200 * STATE_UNSWAP. 5201 */ 5202 p = fword + sp->ts_fidx; 5203 c = *p; 5204 if (c == NUL) 5205 { 5206 /* End of word, can't swap or replace. */ 5207 PROF_STORE(sp->ts_state) 5208 sp->ts_state = STATE_FINAL; 5209 break; 5210 } 5211 5212 /* Don't swap if the first character is not a word character. 5213 * SWAP3 etc. also don't make sense then. */ 5214 if (!soundfold && !spell_iswordp(p, curwin)) 5215 { 5216 PROF_STORE(sp->ts_state) 5217 sp->ts_state = STATE_REP_INI; 5218 break; 5219 } 5220 5221 if (has_mbyte) 5222 { 5223 n = MB_CPTR2LEN(p); 5224 c = mb_ptr2char(p); 5225 if (p[n] == NUL) 5226 c2 = NUL; 5227 else if (!soundfold && !spell_iswordp(p + n, curwin)) 5228 c2 = c; /* don't swap non-word char */ 5229 else 5230 c2 = mb_ptr2char(p + n); 5231 } 5232 else 5233 { 5234 if (p[1] == NUL) 5235 c2 = NUL; 5236 else if (!soundfold && !spell_iswordp(p + 1, curwin)) 5237 c2 = c; /* don't swap non-word char */ 5238 else 5239 c2 = p[1]; 5240 } 5241 5242 /* When the second character is NUL we can't swap. */ 5243 if (c2 == NUL) 5244 { 5245 PROF_STORE(sp->ts_state) 5246 sp->ts_state = STATE_REP_INI; 5247 break; 5248 } 5249 5250 /* When characters are identical, swap won't do anything. 5251 * Also get here if the second char is not a word character. */ 5252 if (c == c2) 5253 { 5254 PROF_STORE(sp->ts_state) 5255 sp->ts_state = STATE_SWAP3; 5256 break; 5257 } 5258 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP)) 5259 { 5260 go_deeper(stack, depth, SCORE_SWAP); 5261 #ifdef DEBUG_TRIEWALK 5262 sprintf(changename[depth], "%.*s-%s: swap %c and %c", 5263 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5264 c, c2); 5265 #endif 5266 PROF_STORE(sp->ts_state) 5267 sp->ts_state = STATE_UNSWAP; 5268 ++depth; 5269 if (has_mbyte) 5270 { 5271 fl = mb_char2len(c2); 5272 mch_memmove(p, p + n, fl); 5273 mb_char2bytes(c, p + fl); 5274 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 5275 } 5276 else 5277 { 5278 p[0] = c2; 5279 p[1] = c; 5280 stack[depth].ts_fidxtry = sp->ts_fidx + 2; 5281 } 5282 } 5283 else 5284 { 5285 /* If this swap doesn't work then SWAP3 won't either. */ 5286 PROF_STORE(sp->ts_state) 5287 sp->ts_state = STATE_REP_INI; 5288 } 5289 break; 5290 5291 case STATE_UNSWAP: 5292 /* Undo the STATE_SWAP swap: "21" -> "12". */ 5293 p = fword + sp->ts_fidx; 5294 if (has_mbyte) 5295 { 5296 n = MB_PTR2LEN(p); 5297 c = mb_ptr2char(p + n); 5298 mch_memmove(p + MB_PTR2LEN(p + n), p, n); 5299 mb_char2bytes(c, p); 5300 } 5301 else 5302 { 5303 c = *p; 5304 *p = p[1]; 5305 p[1] = c; 5306 } 5307 /* FALLTHROUGH */ 5308 5309 case STATE_SWAP3: 5310 /* Swap two bytes, skipping one: "123" -> "321". We change 5311 * "fword" here, it's changed back afterwards at STATE_UNSWAP3. */ 5312 p = fword + sp->ts_fidx; 5313 if (has_mbyte) 5314 { 5315 n = MB_CPTR2LEN(p); 5316 c = mb_ptr2char(p); 5317 fl = MB_CPTR2LEN(p + n); 5318 c2 = mb_ptr2char(p + n); 5319 if (!soundfold && !spell_iswordp(p + n + fl, curwin)) 5320 c3 = c; /* don't swap non-word char */ 5321 else 5322 c3 = mb_ptr2char(p + n + fl); 5323 } 5324 else 5325 { 5326 c = *p; 5327 c2 = p[1]; 5328 if (!soundfold && !spell_iswordp(p + 2, curwin)) 5329 c3 = c; /* don't swap non-word char */ 5330 else 5331 c3 = p[2]; 5332 } 5333 5334 /* When characters are identical: "121" then SWAP3 result is 5335 * identical, ROT3L result is same as SWAP: "211", ROT3L result is 5336 * same as SWAP on next char: "112". Thus skip all swapping. 5337 * Also skip when c3 is NUL. 5338 * Also get here when the third character is not a word character. 5339 * Second character may any char: "a.b" -> "b.a" */ 5340 if (c == c3 || c3 == NUL) 5341 { 5342 PROF_STORE(sp->ts_state) 5343 sp->ts_state = STATE_REP_INI; 5344 break; 5345 } 5346 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5347 { 5348 go_deeper(stack, depth, SCORE_SWAP3); 5349 #ifdef DEBUG_TRIEWALK 5350 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c", 5351 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5352 c, c3); 5353 #endif 5354 PROF_STORE(sp->ts_state) 5355 sp->ts_state = STATE_UNSWAP3; 5356 ++depth; 5357 if (has_mbyte) 5358 { 5359 tl = mb_char2len(c3); 5360 mch_memmove(p, p + n + fl, tl); 5361 mb_char2bytes(c2, p + tl); 5362 mb_char2bytes(c, p + fl + tl); 5363 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; 5364 } 5365 else 5366 { 5367 p[0] = p[2]; 5368 p[2] = c; 5369 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5370 } 5371 } 5372 else 5373 { 5374 PROF_STORE(sp->ts_state) 5375 sp->ts_state = STATE_REP_INI; 5376 } 5377 break; 5378 5379 case STATE_UNSWAP3: 5380 /* Undo STATE_SWAP3: "321" -> "123" */ 5381 p = fword + sp->ts_fidx; 5382 if (has_mbyte) 5383 { 5384 n = MB_PTR2LEN(p); 5385 c2 = mb_ptr2char(p + n); 5386 fl = MB_PTR2LEN(p + n); 5387 c = mb_ptr2char(p + n + fl); 5388 tl = MB_PTR2LEN(p + n + fl); 5389 mch_memmove(p + fl + tl, p, n); 5390 mb_char2bytes(c, p); 5391 mb_char2bytes(c2, p + tl); 5392 p = p + tl; 5393 } 5394 else 5395 { 5396 c = *p; 5397 *p = p[2]; 5398 p[2] = c; 5399 ++p; 5400 } 5401 5402 if (!soundfold && !spell_iswordp(p, curwin)) 5403 { 5404 /* Middle char is not a word char, skip the rotate. First and 5405 * third char were already checked at swap and swap3. */ 5406 PROF_STORE(sp->ts_state) 5407 sp->ts_state = STATE_REP_INI; 5408 break; 5409 } 5410 5411 /* Rotate three characters left: "123" -> "231". We change 5412 * "fword" here, it's changed back afterwards at STATE_UNROT3L. */ 5413 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5414 { 5415 go_deeper(stack, depth, SCORE_SWAP3); 5416 #ifdef DEBUG_TRIEWALK 5417 p = fword + sp->ts_fidx; 5418 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c", 5419 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5420 p[0], p[1], p[2]); 5421 #endif 5422 PROF_STORE(sp->ts_state) 5423 sp->ts_state = STATE_UNROT3L; 5424 ++depth; 5425 p = fword + sp->ts_fidx; 5426 if (has_mbyte) 5427 { 5428 n = MB_CPTR2LEN(p); 5429 c = mb_ptr2char(p); 5430 fl = MB_CPTR2LEN(p + n); 5431 fl += MB_CPTR2LEN(p + n + fl); 5432 mch_memmove(p, p + n, fl); 5433 mb_char2bytes(c, p + fl); 5434 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 5435 } 5436 else 5437 { 5438 c = *p; 5439 *p = p[1]; 5440 p[1] = p[2]; 5441 p[2] = c; 5442 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5443 } 5444 } 5445 else 5446 { 5447 PROF_STORE(sp->ts_state) 5448 sp->ts_state = STATE_REP_INI; 5449 } 5450 break; 5451 5452 case STATE_UNROT3L: 5453 /* Undo ROT3L: "231" -> "123" */ 5454 p = fword + sp->ts_fidx; 5455 if (has_mbyte) 5456 { 5457 n = MB_PTR2LEN(p); 5458 n += MB_PTR2LEN(p + n); 5459 c = mb_ptr2char(p + n); 5460 tl = MB_PTR2LEN(p + n); 5461 mch_memmove(p + tl, p, n); 5462 mb_char2bytes(c, p); 5463 } 5464 else 5465 { 5466 c = p[2]; 5467 p[2] = p[1]; 5468 p[1] = *p; 5469 *p = c; 5470 } 5471 5472 /* Rotate three bytes right: "123" -> "312". We change "fword" 5473 * here, it's changed back afterwards at STATE_UNROT3R. */ 5474 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5475 { 5476 go_deeper(stack, depth, SCORE_SWAP3); 5477 #ifdef DEBUG_TRIEWALK 5478 p = fword + sp->ts_fidx; 5479 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c", 5480 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5481 p[0], p[1], p[2]); 5482 #endif 5483 PROF_STORE(sp->ts_state) 5484 sp->ts_state = STATE_UNROT3R; 5485 ++depth; 5486 p = fword + sp->ts_fidx; 5487 if (has_mbyte) 5488 { 5489 n = MB_CPTR2LEN(p); 5490 n += MB_CPTR2LEN(p + n); 5491 c = mb_ptr2char(p + n); 5492 tl = MB_CPTR2LEN(p + n); 5493 mch_memmove(p + tl, p, n); 5494 mb_char2bytes(c, p); 5495 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; 5496 } 5497 else 5498 { 5499 c = p[2]; 5500 p[2] = p[1]; 5501 p[1] = *p; 5502 *p = c; 5503 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5504 } 5505 } 5506 else 5507 { 5508 PROF_STORE(sp->ts_state) 5509 sp->ts_state = STATE_REP_INI; 5510 } 5511 break; 5512 5513 case STATE_UNROT3R: 5514 /* Undo ROT3R: "312" -> "123" */ 5515 p = fword + sp->ts_fidx; 5516 if (has_mbyte) 5517 { 5518 c = mb_ptr2char(p); 5519 tl = MB_PTR2LEN(p); 5520 n = MB_PTR2LEN(p + tl); 5521 n += MB_PTR2LEN(p + tl + n); 5522 mch_memmove(p, p + tl, n); 5523 mb_char2bytes(c, p + n); 5524 } 5525 else 5526 { 5527 c = *p; 5528 *p = p[1]; 5529 p[1] = p[2]; 5530 p[2] = c; 5531 } 5532 /* FALLTHROUGH */ 5533 5534 case STATE_REP_INI: 5535 /* Check if matching with REP items from the .aff file would work. 5536 * Quickly skip if: 5537 * - there are no REP items and we are not in the soundfold trie 5538 * - the score is going to be too high anyway 5539 * - already applied a REP item or swapped here */ 5540 if ((lp->lp_replang == NULL && !soundfold) 5541 || sp->ts_score + SCORE_REP >= su->su_maxscore 5542 || sp->ts_fidx < sp->ts_fidxtry) 5543 { 5544 PROF_STORE(sp->ts_state) 5545 sp->ts_state = STATE_FINAL; 5546 break; 5547 } 5548 5549 /* Use the first byte to quickly find the first entry that may 5550 * match. If the index is -1 there is none. */ 5551 if (soundfold) 5552 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]]; 5553 else 5554 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; 5555 5556 if (sp->ts_curi < 0) 5557 { 5558 PROF_STORE(sp->ts_state) 5559 sp->ts_state = STATE_FINAL; 5560 break; 5561 } 5562 5563 PROF_STORE(sp->ts_state) 5564 sp->ts_state = STATE_REP; 5565 /* FALLTHROUGH */ 5566 5567 case STATE_REP: 5568 /* Try matching with REP items from the .aff file. For each match 5569 * replace the characters and check if the resulting word is 5570 * valid. */ 5571 p = fword + sp->ts_fidx; 5572 5573 if (soundfold) 5574 gap = &slang->sl_repsal; 5575 else 5576 gap = &lp->lp_replang->sl_rep; 5577 while (sp->ts_curi < gap->ga_len) 5578 { 5579 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; 5580 if (*ftp->ft_from != *p) 5581 { 5582 /* past possible matching entries */ 5583 sp->ts_curi = gap->ga_len; 5584 break; 5585 } 5586 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 5587 && TRY_DEEPER(su, stack, depth, SCORE_REP)) 5588 { 5589 go_deeper(stack, depth, SCORE_REP); 5590 #ifdef DEBUG_TRIEWALK 5591 sprintf(changename[depth], "%.*s-%s: replace %s with %s", 5592 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5593 ftp->ft_from, ftp->ft_to); 5594 #endif 5595 /* Need to undo this afterwards. */ 5596 PROF_STORE(sp->ts_state) 5597 sp->ts_state = STATE_REP_UNDO; 5598 5599 /* Change the "from" to the "to" string. */ 5600 ++depth; 5601 fl = (int)STRLEN(ftp->ft_from); 5602 tl = (int)STRLEN(ftp->ft_to); 5603 if (fl != tl) 5604 { 5605 STRMOVE(p + tl, p + fl); 5606 repextra += tl - fl; 5607 } 5608 mch_memmove(p, ftp->ft_to, tl); 5609 stack[depth].ts_fidxtry = sp->ts_fidx + tl; 5610 stack[depth].ts_tcharlen = 0; 5611 break; 5612 } 5613 } 5614 5615 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) 5616 { 5617 /* No (more) matches. */ 5618 PROF_STORE(sp->ts_state) 5619 sp->ts_state = STATE_FINAL; 5620 } 5621 5622 break; 5623 5624 case STATE_REP_UNDO: 5625 /* Undo a REP replacement and continue with the next one. */ 5626 if (soundfold) 5627 gap = &slang->sl_repsal; 5628 else 5629 gap = &lp->lp_replang->sl_rep; 5630 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; 5631 fl = (int)STRLEN(ftp->ft_from); 5632 tl = (int)STRLEN(ftp->ft_to); 5633 p = fword + sp->ts_fidx; 5634 if (fl != tl) 5635 { 5636 STRMOVE(p + fl, p + tl); 5637 repextra -= tl - fl; 5638 } 5639 mch_memmove(p, ftp->ft_from, fl); 5640 PROF_STORE(sp->ts_state) 5641 sp->ts_state = STATE_REP; 5642 break; 5643 5644 default: 5645 /* Did all possible states at this level, go up one level. */ 5646 --depth; 5647 5648 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) 5649 { 5650 /* Continue in or go back to the prefix tree. */ 5651 byts = pbyts; 5652 idxs = pidxs; 5653 } 5654 5655 /* Don't check for CTRL-C too often, it takes time. */ 5656 if (--breakcheckcount == 0) 5657 { 5658 ui_breakcheck(); 5659 breakcheckcount = 1000; 5660 } 5661 } 5662 } 5663 } 5664 5665 5666 /* 5667 * Go one level deeper in the tree. 5668 */ 5669 static void 5670 go_deeper(trystate_T *stack, int depth, int score_add) 5671 { 5672 stack[depth + 1] = stack[depth]; 5673 stack[depth + 1].ts_state = STATE_START; 5674 stack[depth + 1].ts_score = stack[depth].ts_score + score_add; 5675 stack[depth + 1].ts_curi = 1; /* start just after length byte */ 5676 stack[depth + 1].ts_flags = 0; 5677 } 5678 5679 /* 5680 * Case-folding may change the number of bytes: Count nr of chars in 5681 * fword[flen] and return the byte length of that many chars in "word". 5682 */ 5683 static int 5684 nofold_len(char_u *fword, int flen, char_u *word) 5685 { 5686 char_u *p; 5687 int i = 0; 5688 5689 for (p = fword; p < fword + flen; MB_PTR_ADV(p)) 5690 ++i; 5691 for (p = word; i > 0; MB_PTR_ADV(p)) 5692 --i; 5693 return (int)(p - word); 5694 } 5695 5696 /* 5697 * "fword" is a good word with case folded. Find the matching keep-case 5698 * words and put it in "kword". 5699 * Theoretically there could be several keep-case words that result in the 5700 * same case-folded word, but we only find one... 5701 */ 5702 static void 5703 find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword) 5704 { 5705 char_u uword[MAXWLEN]; /* "fword" in upper-case */ 5706 int depth; 5707 idx_T tryidx; 5708 5709 /* The following arrays are used at each depth in the tree. */ 5710 idx_T arridx[MAXWLEN]; 5711 int round[MAXWLEN]; 5712 int fwordidx[MAXWLEN]; 5713 int uwordidx[MAXWLEN]; 5714 int kwordlen[MAXWLEN]; 5715 5716 int flen, ulen; 5717 int l; 5718 int len; 5719 int c; 5720 idx_T lo, hi, m; 5721 char_u *p; 5722 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */ 5723 idx_T *idxs = slang->sl_kidxs; /* array with indexes */ 5724 5725 if (byts == NULL) 5726 { 5727 /* array is empty: "cannot happen" */ 5728 *kword = NUL; 5729 return; 5730 } 5731 5732 /* Make an all-cap version of "fword". */ 5733 allcap_copy(fword, uword); 5734 5735 /* 5736 * Each character needs to be tried both case-folded and upper-case. 5737 * All this gets very complicated if we keep in mind that changing case 5738 * may change the byte length of a multi-byte character... 5739 */ 5740 depth = 0; 5741 arridx[0] = 0; 5742 round[0] = 0; 5743 fwordidx[0] = 0; 5744 uwordidx[0] = 0; 5745 kwordlen[0] = 0; 5746 while (depth >= 0) 5747 { 5748 if (fword[fwordidx[depth]] == NUL) 5749 { 5750 /* We are at the end of "fword". If the tree allows a word to end 5751 * here we have found a match. */ 5752 if (byts[arridx[depth] + 1] == 0) 5753 { 5754 kword[kwordlen[depth]] = NUL; 5755 return; 5756 } 5757 5758 /* kword is getting too long, continue one level up */ 5759 --depth; 5760 } 5761 else if (++round[depth] > 2) 5762 { 5763 /* tried both fold-case and upper-case character, continue one 5764 * level up */ 5765 --depth; 5766 } 5767 else 5768 { 5769 /* 5770 * round[depth] == 1: Try using the folded-case character. 5771 * round[depth] == 2: Try using the upper-case character. 5772 */ 5773 if (has_mbyte) 5774 { 5775 flen = MB_CPTR2LEN(fword + fwordidx[depth]); 5776 ulen = MB_CPTR2LEN(uword + uwordidx[depth]); 5777 } 5778 else 5779 ulen = flen = 1; 5780 if (round[depth] == 1) 5781 { 5782 p = fword + fwordidx[depth]; 5783 l = flen; 5784 } 5785 else 5786 { 5787 p = uword + uwordidx[depth]; 5788 l = ulen; 5789 } 5790 5791 for (tryidx = arridx[depth]; l > 0; --l) 5792 { 5793 /* Perform a binary search in the list of accepted bytes. */ 5794 len = byts[tryidx++]; 5795 c = *p++; 5796 lo = tryidx; 5797 hi = tryidx + len - 1; 5798 while (lo < hi) 5799 { 5800 m = (lo + hi) / 2; 5801 if (byts[m] > c) 5802 hi = m - 1; 5803 else if (byts[m] < c) 5804 lo = m + 1; 5805 else 5806 { 5807 lo = hi = m; 5808 break; 5809 } 5810 } 5811 5812 /* Stop if there is no matching byte. */ 5813 if (hi < lo || byts[lo] != c) 5814 break; 5815 5816 /* Continue at the child (if there is one). */ 5817 tryidx = idxs[lo]; 5818 } 5819 5820 if (l == 0) 5821 { 5822 /* 5823 * Found the matching char. Copy it to "kword" and go a 5824 * level deeper. 5825 */ 5826 if (round[depth] == 1) 5827 { 5828 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth], 5829 flen); 5830 kwordlen[depth + 1] = kwordlen[depth] + flen; 5831 } 5832 else 5833 { 5834 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth], 5835 ulen); 5836 kwordlen[depth + 1] = kwordlen[depth] + ulen; 5837 } 5838 fwordidx[depth + 1] = fwordidx[depth] + flen; 5839 uwordidx[depth + 1] = uwordidx[depth] + ulen; 5840 5841 ++depth; 5842 arridx[depth] = tryidx; 5843 round[depth] = 0; 5844 } 5845 } 5846 } 5847 5848 /* Didn't find it: "cannot happen". */ 5849 *kword = NUL; 5850 } 5851 5852 /* 5853 * Compute the sound-a-like score for suggestions in su->su_ga and add them to 5854 * su->su_sga. 5855 */ 5856 static void 5857 score_comp_sal(suginfo_T *su) 5858 { 5859 langp_T *lp; 5860 char_u badsound[MAXWLEN]; 5861 int i; 5862 suggest_T *stp; 5863 suggest_T *sstp; 5864 int score; 5865 int lpi; 5866 5867 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL) 5868 return; 5869 5870 /* Use the sound-folding of the first language that supports it. */ 5871 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 5872 { 5873 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 5874 if (lp->lp_slang->sl_sal.ga_len > 0) 5875 { 5876 /* soundfold the bad word */ 5877 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound); 5878 5879 for (i = 0; i < su->su_ga.ga_len; ++i) 5880 { 5881 stp = &SUG(su->su_ga, i); 5882 5883 /* Case-fold the suggested word, sound-fold it and compute the 5884 * sound-a-like score. */ 5885 score = stp_sal_score(stp, su, lp->lp_slang, badsound); 5886 if (score < SCORE_MAXMAX) 5887 { 5888 /* Add the suggestion. */ 5889 sstp = &SUG(su->su_sga, su->su_sga.ga_len); 5890 sstp->st_word = vim_strsave(stp->st_word); 5891 if (sstp->st_word != NULL) 5892 { 5893 sstp->st_wordlen = stp->st_wordlen; 5894 sstp->st_score = score; 5895 sstp->st_altscore = 0; 5896 sstp->st_orglen = stp->st_orglen; 5897 ++su->su_sga.ga_len; 5898 } 5899 } 5900 } 5901 break; 5902 } 5903 } 5904 } 5905 5906 /* 5907 * Combine the list of suggestions in su->su_ga and su->su_sga. 5908 * They are entwined. 5909 */ 5910 static void 5911 score_combine(suginfo_T *su) 5912 { 5913 int i; 5914 int j; 5915 garray_T ga; 5916 garray_T *gap; 5917 langp_T *lp; 5918 suggest_T *stp; 5919 char_u *p; 5920 char_u badsound[MAXWLEN]; 5921 int round; 5922 int lpi; 5923 slang_T *slang = NULL; 5924 5925 /* Add the alternate score to su_ga. */ 5926 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 5927 { 5928 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 5929 if (lp->lp_slang->sl_sal.ga_len > 0) 5930 { 5931 /* soundfold the bad word */ 5932 slang = lp->lp_slang; 5933 spell_soundfold(slang, su->su_fbadword, TRUE, badsound); 5934 5935 for (i = 0; i < su->su_ga.ga_len; ++i) 5936 { 5937 stp = &SUG(su->su_ga, i); 5938 stp->st_altscore = stp_sal_score(stp, su, slang, badsound); 5939 if (stp->st_altscore == SCORE_MAXMAX) 5940 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; 5941 else 5942 stp->st_score = (stp->st_score * 3 5943 + stp->st_altscore) / 4; 5944 stp->st_salscore = FALSE; 5945 } 5946 break; 5947 } 5948 } 5949 5950 if (slang == NULL) /* Using "double" without sound folding. */ 5951 { 5952 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, 5953 su->su_maxcount); 5954 return; 5955 } 5956 5957 /* Add the alternate score to su_sga. */ 5958 for (i = 0; i < su->su_sga.ga_len; ++i) 5959 { 5960 stp = &SUG(su->su_sga, i); 5961 stp->st_altscore = spell_edit_score(slang, 5962 su->su_badword, stp->st_word); 5963 if (stp->st_score == SCORE_MAXMAX) 5964 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; 5965 else 5966 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; 5967 stp->st_salscore = TRUE; 5968 } 5969 5970 /* Remove bad suggestions, sort the suggestions and truncate at "maxcount" 5971 * for both lists. */ 5972 check_suggestions(su, &su->su_ga); 5973 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 5974 check_suggestions(su, &su->su_sga); 5975 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); 5976 5977 ga_init2(&ga, (int)sizeof(suginfo_T), 1); 5978 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) 5979 return; 5980 5981 stp = &SUG(ga, 0); 5982 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) 5983 { 5984 /* round 1: get a suggestion from su_ga 5985 * round 2: get a suggestion from su_sga */ 5986 for (round = 1; round <= 2; ++round) 5987 { 5988 gap = round == 1 ? &su->su_ga : &su->su_sga; 5989 if (i < gap->ga_len) 5990 { 5991 /* Don't add a word if it's already there. */ 5992 p = SUG(*gap, i).st_word; 5993 for (j = 0; j < ga.ga_len; ++j) 5994 if (STRCMP(stp[j].st_word, p) == 0) 5995 break; 5996 if (j == ga.ga_len) 5997 stp[ga.ga_len++] = SUG(*gap, i); 5998 else 5999 vim_free(p); 6000 } 6001 } 6002 } 6003 6004 ga_clear(&su->su_ga); 6005 ga_clear(&su->su_sga); 6006 6007 /* Truncate the list to the number of suggestions that will be displayed. */ 6008 if (ga.ga_len > su->su_maxcount) 6009 { 6010 for (i = su->su_maxcount; i < ga.ga_len; ++i) 6011 vim_free(stp[i].st_word); 6012 ga.ga_len = su->su_maxcount; 6013 } 6014 6015 su->su_ga = ga; 6016 } 6017 6018 /* 6019 * For the goodword in "stp" compute the soundalike score compared to the 6020 * badword. 6021 */ 6022 static int 6023 stp_sal_score( 6024 suggest_T *stp, 6025 suginfo_T *su, 6026 slang_T *slang, 6027 char_u *badsound) /* sound-folded badword */ 6028 { 6029 char_u *p; 6030 char_u *pbad; 6031 char_u *pgood; 6032 char_u badsound2[MAXWLEN]; 6033 char_u fword[MAXWLEN]; 6034 char_u goodsound[MAXWLEN]; 6035 char_u goodword[MAXWLEN]; 6036 int lendiff; 6037 6038 lendiff = (int)(su->su_badlen - stp->st_orglen); 6039 if (lendiff >= 0) 6040 pbad = badsound; 6041 else 6042 { 6043 /* soundfold the bad word with more characters following */ 6044 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN); 6045 6046 /* When joining two words the sound often changes a lot. E.g., "t he" 6047 * sounds like "t h" while "the" sounds like "@". Avoid that by 6048 * removing the space. Don't do it when the good word also contains a 6049 * space. */ 6050 if (VIM_ISWHITE(su->su_badptr[su->su_badlen]) 6051 && *skiptowhite(stp->st_word) == NUL) 6052 for (p = fword; *(p = skiptowhite(p)) != NUL; ) 6053 STRMOVE(p, p + 1); 6054 6055 spell_soundfold(slang, fword, TRUE, badsound2); 6056 pbad = badsound2; 6057 } 6058 6059 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) 6060 { 6061 /* Add part of the bad word to the good word, so that we soundfold 6062 * what replaces the bad word. */ 6063 STRCPY(goodword, stp->st_word); 6064 vim_strncpy(goodword + stp->st_wordlen, 6065 su->su_badptr + su->su_badlen - lendiff, lendiff); 6066 pgood = goodword; 6067 } 6068 else 6069 pgood = stp->st_word; 6070 6071 /* Sound-fold the word and compute the score for the difference. */ 6072 spell_soundfold(slang, pgood, FALSE, goodsound); 6073 6074 return soundalike_score(goodsound, pbad); 6075 } 6076 6077 /* structure used to store soundfolded words that add_sound_suggest() has 6078 * handled already. */ 6079 typedef struct 6080 { 6081 short sft_score; /* lowest score used */ 6082 char_u sft_word[1]; /* soundfolded word, actually longer */ 6083 } sftword_T; 6084 6085 static sftword_T dumsft; 6086 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft))) 6087 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) 6088 6089 /* 6090 * Prepare for calling suggest_try_soundalike(). 6091 */ 6092 static void 6093 suggest_try_soundalike_prep(void) 6094 { 6095 langp_T *lp; 6096 int lpi; 6097 slang_T *slang; 6098 6099 /* Do this for all languages that support sound folding and for which a 6100 * .sug file has been loaded. */ 6101 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6102 { 6103 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6104 slang = lp->lp_slang; 6105 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6106 /* prepare the hashtable used by add_sound_suggest() */ 6107 hash_init(&slang->sl_sounddone); 6108 } 6109 } 6110 6111 /* 6112 * Find suggestions by comparing the word in a sound-a-like form. 6113 * Note: This doesn't support postponed prefixes. 6114 */ 6115 static void 6116 suggest_try_soundalike(suginfo_T *su) 6117 { 6118 char_u salword[MAXWLEN]; 6119 langp_T *lp; 6120 int lpi; 6121 slang_T *slang; 6122 6123 /* Do this for all languages that support sound folding and for which a 6124 * .sug file has been loaded. */ 6125 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6126 { 6127 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6128 slang = lp->lp_slang; 6129 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6130 { 6131 /* soundfold the bad word */ 6132 spell_soundfold(slang, su->su_fbadword, TRUE, salword); 6133 6134 /* try all kinds of inserts/deletes/swaps/etc. */ 6135 /* TODO: also soundfold the next words, so that we can try joining 6136 * and splitting */ 6137 #ifdef SUGGEST_PROFILE 6138 prof_init(); 6139 #endif 6140 suggest_trie_walk(su, lp, salword, TRUE); 6141 #ifdef SUGGEST_PROFILE 6142 prof_report("soundalike"); 6143 #endif 6144 } 6145 } 6146 } 6147 6148 /* 6149 * Finish up after calling suggest_try_soundalike(). 6150 */ 6151 static void 6152 suggest_try_soundalike_finish(void) 6153 { 6154 langp_T *lp; 6155 int lpi; 6156 slang_T *slang; 6157 int todo; 6158 hashitem_T *hi; 6159 6160 /* Do this for all languages that support sound folding and for which a 6161 * .sug file has been loaded. */ 6162 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6163 { 6164 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6165 slang = lp->lp_slang; 6166 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6167 { 6168 /* Free the info about handled words. */ 6169 todo = (int)slang->sl_sounddone.ht_used; 6170 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi) 6171 if (!HASHITEM_EMPTY(hi)) 6172 { 6173 vim_free(HI2SFT(hi)); 6174 --todo; 6175 } 6176 6177 /* Clear the hashtable, it may also be used by another region. */ 6178 hash_clear(&slang->sl_sounddone); 6179 hash_init(&slang->sl_sounddone); 6180 } 6181 } 6182 } 6183 6184 /* 6185 * A match with a soundfolded word is found. Add the good word(s) that 6186 * produce this soundfolded word. 6187 */ 6188 static void 6189 add_sound_suggest( 6190 suginfo_T *su, 6191 char_u *goodword, 6192 int score, /* soundfold score */ 6193 langp_T *lp) 6194 { 6195 slang_T *slang = lp->lp_slang; /* language for sound folding */ 6196 int sfwordnr; 6197 char_u *nrline; 6198 int orgnr; 6199 char_u theword[MAXWLEN]; 6200 int i; 6201 int wlen; 6202 char_u *byts; 6203 idx_T *idxs; 6204 int n; 6205 int wordcount; 6206 int wc; 6207 int goodscore; 6208 hash_T hash; 6209 hashitem_T *hi; 6210 sftword_T *sft; 6211 int bc, gc; 6212 int limit; 6213 6214 /* 6215 * It's very well possible that the same soundfold word is found several 6216 * times with different scores. Since the following is quite slow only do 6217 * the words that have a better score than before. Use a hashtable to 6218 * remember the words that have been done. 6219 */ 6220 hash = hash_hash(goodword); 6221 hi = hash_lookup(&slang->sl_sounddone, goodword, hash); 6222 if (HASHITEM_EMPTY(hi)) 6223 { 6224 sft = (sftword_T *)alloc((unsigned)(sizeof(sftword_T) 6225 + STRLEN(goodword))); 6226 if (sft != NULL) 6227 { 6228 sft->sft_score = score; 6229 STRCPY(sft->sft_word, goodword); 6230 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash); 6231 } 6232 } 6233 else 6234 { 6235 sft = HI2SFT(hi); 6236 if (score >= sft->sft_score) 6237 return; 6238 sft->sft_score = score; 6239 } 6240 6241 /* 6242 * Find the word nr in the soundfold tree. 6243 */ 6244 sfwordnr = soundfold_find(slang, goodword); 6245 if (sfwordnr < 0) 6246 { 6247 internal_error("add_sound_suggest()"); 6248 return; 6249 } 6250 6251 /* 6252 * go over the list of good words that produce this soundfold word 6253 */ 6254 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE); 6255 orgnr = 0; 6256 while (*nrline != NUL) 6257 { 6258 /* The wordnr was stored in a minimal nr of bytes as an offset to the 6259 * previous wordnr. */ 6260 orgnr += bytes2offset(&nrline); 6261 6262 byts = slang->sl_fbyts; 6263 idxs = slang->sl_fidxs; 6264 6265 /* Lookup the word "orgnr" one of the two tries. */ 6266 n = 0; 6267 wordcount = 0; 6268 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen) 6269 { 6270 i = 1; 6271 if (wordcount == orgnr && byts[n + 1] == NUL) 6272 break; /* found end of word */ 6273 6274 if (byts[n + 1] == NUL) 6275 ++wordcount; 6276 6277 /* skip over the NUL bytes */ 6278 for ( ; byts[n + i] == NUL; ++i) 6279 if (i > byts[n]) /* safety check */ 6280 { 6281 STRCPY(theword + wlen, "BAD"); 6282 wlen += 3; 6283 goto badword; 6284 } 6285 6286 /* One of the siblings must have the word. */ 6287 for ( ; i < byts[n]; ++i) 6288 { 6289 wc = idxs[idxs[n + i]]; /* nr of words under this byte */ 6290 if (wordcount + wc > orgnr) 6291 break; 6292 wordcount += wc; 6293 } 6294 6295 theword[wlen] = byts[n + i]; 6296 n = idxs[n + i]; 6297 } 6298 badword: 6299 theword[wlen] = NUL; 6300 6301 /* Go over the possible flags and regions. */ 6302 for (; i <= byts[n] && byts[n + i] == NUL; ++i) 6303 { 6304 char_u cword[MAXWLEN]; 6305 char_u *p; 6306 int flags = (int)idxs[n + i]; 6307 6308 /* Skip words with the NOSUGGEST flag */ 6309 if (flags & WF_NOSUGGEST) 6310 continue; 6311 6312 if (flags & WF_KEEPCAP) 6313 { 6314 /* Must find the word in the keep-case tree. */ 6315 find_keepcap_word(slang, theword, cword); 6316 p = cword; 6317 } 6318 else 6319 { 6320 flags |= su->su_badflags; 6321 if ((flags & WF_CAPMASK) != 0) 6322 { 6323 /* Need to fix case according to "flags". */ 6324 make_case_word(theword, cword, flags); 6325 p = cword; 6326 } 6327 else 6328 p = theword; 6329 } 6330 6331 /* Add the suggestion. */ 6332 if (sps_flags & SPS_DOUBLE) 6333 { 6334 /* Add the suggestion if the score isn't too bad. */ 6335 if (score <= su->su_maxscore) 6336 add_suggestion(su, &su->su_sga, p, su->su_badlen, 6337 score, 0, FALSE, slang, FALSE); 6338 } 6339 else 6340 { 6341 /* Add a penalty for words in another region. */ 6342 if ((flags & WF_REGION) 6343 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 6344 goodscore = SCORE_REGION; 6345 else 6346 goodscore = 0; 6347 6348 /* Add a small penalty for changing the first letter from 6349 * lower to upper case. Helps for "tath" -> "Kath", which is 6350 * less common than "tath" -> "path". Don't do it when the 6351 * letter is the same, that has already been counted. */ 6352 gc = PTR2CHAR(p); 6353 if (SPELL_ISUPPER(gc)) 6354 { 6355 bc = PTR2CHAR(su->su_badword); 6356 if (!SPELL_ISUPPER(bc) 6357 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) 6358 goodscore += SCORE_ICASE / 2; 6359 } 6360 6361 /* Compute the score for the good word. This only does letter 6362 * insert/delete/swap/replace. REP items are not considered, 6363 * which may make the score a bit higher. 6364 * Use a limit for the score to make it work faster. Use 6365 * MAXSCORE(), because RESCORE() will change the score. 6366 * If the limit is very high then the iterative method is 6367 * inefficient, using an array is quicker. */ 6368 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); 6369 if (limit > SCORE_LIMITMAX) 6370 goodscore += spell_edit_score(slang, su->su_badword, p); 6371 else 6372 goodscore += spell_edit_score_limit(slang, su->su_badword, 6373 p, limit); 6374 6375 /* When going over the limit don't bother to do the rest. */ 6376 if (goodscore < SCORE_MAXMAX) 6377 { 6378 /* Give a bonus to words seen before. */ 6379 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE); 6380 6381 /* Add the suggestion if the score isn't too bad. */ 6382 goodscore = RESCORE(goodscore, score); 6383 if (goodscore <= su->su_sfmaxscore) 6384 add_suggestion(su, &su->su_ga, p, su->su_badlen, 6385 goodscore, score, TRUE, slang, TRUE); 6386 } 6387 } 6388 } 6389 /* smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr); */ 6390 } 6391 } 6392 6393 /* 6394 * Find word "word" in fold-case tree for "slang" and return the word number. 6395 */ 6396 static int 6397 soundfold_find(slang_T *slang, char_u *word) 6398 { 6399 idx_T arridx = 0; 6400 int len; 6401 int wlen = 0; 6402 int c; 6403 char_u *ptr = word; 6404 char_u *byts; 6405 idx_T *idxs; 6406 int wordnr = 0; 6407 6408 byts = slang->sl_sbyts; 6409 idxs = slang->sl_sidxs; 6410 6411 for (;;) 6412 { 6413 /* First byte is the number of possible bytes. */ 6414 len = byts[arridx++]; 6415 6416 /* If the first possible byte is a zero the word could end here. 6417 * If the word ends we found the word. If not skip the NUL bytes. */ 6418 c = ptr[wlen]; 6419 if (byts[arridx] == NUL) 6420 { 6421 if (c == NUL) 6422 break; 6423 6424 /* Skip over the zeros, there can be several. */ 6425 while (len > 0 && byts[arridx] == NUL) 6426 { 6427 ++arridx; 6428 --len; 6429 } 6430 if (len == 0) 6431 return -1; /* no children, word should have ended here */ 6432 ++wordnr; 6433 } 6434 6435 /* If the word ends we didn't find it. */ 6436 if (c == NUL) 6437 return -1; 6438 6439 /* Perform a binary search in the list of accepted bytes. */ 6440 if (c == TAB) /* <Tab> is handled like <Space> */ 6441 c = ' '; 6442 while (byts[arridx] < c) 6443 { 6444 /* The word count is in the first idxs[] entry of the child. */ 6445 wordnr += idxs[idxs[arridx]]; 6446 ++arridx; 6447 if (--len == 0) /* end of the bytes, didn't find it */ 6448 return -1; 6449 } 6450 if (byts[arridx] != c) /* didn't find the byte */ 6451 return -1; 6452 6453 /* Continue at the child (if there is one). */ 6454 arridx = idxs[arridx]; 6455 ++wlen; 6456 6457 /* One space in the good word may stand for several spaces in the 6458 * checked word. */ 6459 if (c == ' ') 6460 while (ptr[wlen] == ' ' || ptr[wlen] == TAB) 6461 ++wlen; 6462 } 6463 6464 return wordnr; 6465 } 6466 6467 /* 6468 * Copy "fword" to "cword", fixing case according to "flags". 6469 */ 6470 static void 6471 make_case_word(char_u *fword, char_u *cword, int flags) 6472 { 6473 if (flags & WF_ALLCAP) 6474 /* Make it all upper-case */ 6475 allcap_copy(fword, cword); 6476 else if (flags & WF_ONECAP) 6477 /* Make the first letter upper-case */ 6478 onecap_copy(fword, cword, TRUE); 6479 else 6480 /* Use goodword as-is. */ 6481 STRCPY(cword, fword); 6482 } 6483 6484 6485 /* 6486 * Return TRUE if "c1" and "c2" are similar characters according to the MAP 6487 * lines in the .aff file. 6488 */ 6489 static int 6490 similar_chars(slang_T *slang, int c1, int c2) 6491 { 6492 int m1, m2; 6493 char_u buf[MB_MAXBYTES + 1]; 6494 hashitem_T *hi; 6495 6496 if (c1 >= 256) 6497 { 6498 buf[mb_char2bytes(c1, buf)] = 0; 6499 hi = hash_find(&slang->sl_map_hash, buf); 6500 if (HASHITEM_EMPTY(hi)) 6501 m1 = 0; 6502 else 6503 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); 6504 } 6505 else 6506 m1 = slang->sl_map_array[c1]; 6507 if (m1 == 0) 6508 return FALSE; 6509 6510 6511 if (c2 >= 256) 6512 { 6513 buf[mb_char2bytes(c2, buf)] = 0; 6514 hi = hash_find(&slang->sl_map_hash, buf); 6515 if (HASHITEM_EMPTY(hi)) 6516 m2 = 0; 6517 else 6518 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); 6519 } 6520 else 6521 m2 = slang->sl_map_array[c2]; 6522 6523 return m1 == m2; 6524 } 6525 6526 /* 6527 * Add a suggestion to the list of suggestions. 6528 * For a suggestion that is already in the list the lowest score is remembered. 6529 */ 6530 static void 6531 add_suggestion( 6532 suginfo_T *su, 6533 garray_T *gap, /* either su_ga or su_sga */ 6534 char_u *goodword, 6535 int badlenarg, /* len of bad word replaced with "goodword" */ 6536 int score, 6537 int altscore, 6538 int had_bonus, /* value for st_had_bonus */ 6539 slang_T *slang, /* language for sound folding */ 6540 int maxsf) /* su_maxscore applies to soundfold score, 6541 su_sfmaxscore to the total score. */ 6542 { 6543 int goodlen; /* len of goodword changed */ 6544 int badlen; /* len of bad word changed */ 6545 suggest_T *stp; 6546 suggest_T new_sug; 6547 int i; 6548 char_u *pgood, *pbad; 6549 6550 /* Minimize "badlen" for consistency. Avoids that changing "the the" to 6551 * "thee the" is added next to changing the first "the" the "thee". */ 6552 pgood = goodword + STRLEN(goodword); 6553 pbad = su->su_badptr + badlenarg; 6554 for (;;) 6555 { 6556 goodlen = (int)(pgood - goodword); 6557 badlen = (int)(pbad - su->su_badptr); 6558 if (goodlen <= 0 || badlen <= 0) 6559 break; 6560 MB_PTR_BACK(goodword, pgood); 6561 MB_PTR_BACK(su->su_badptr, pbad); 6562 if (has_mbyte) 6563 { 6564 if (mb_ptr2char(pgood) != mb_ptr2char(pbad)) 6565 break; 6566 } 6567 else if (*pgood != *pbad) 6568 break; 6569 } 6570 6571 if (badlen == 0 && goodlen == 0) 6572 /* goodword doesn't change anything; may happen for "the the" changing 6573 * the first "the" to itself. */ 6574 return; 6575 6576 if (gap->ga_len == 0) 6577 i = -1; 6578 else 6579 { 6580 /* Check if the word is already there. Also check the length that is 6581 * being replaced "thes," -> "these" is a different suggestion from 6582 * "thes" -> "these". */ 6583 stp = &SUG(*gap, 0); 6584 for (i = gap->ga_len; --i >= 0; ++stp) 6585 if (stp->st_wordlen == goodlen 6586 && stp->st_orglen == badlen 6587 && STRNCMP(stp->st_word, goodword, goodlen) == 0) 6588 { 6589 /* 6590 * Found it. Remember the word with the lowest score. 6591 */ 6592 if (stp->st_slang == NULL) 6593 stp->st_slang = slang; 6594 6595 new_sug.st_score = score; 6596 new_sug.st_altscore = altscore; 6597 new_sug.st_had_bonus = had_bonus; 6598 6599 if (stp->st_had_bonus != had_bonus) 6600 { 6601 /* Only one of the two had the soundalike score computed. 6602 * Need to do that for the other one now, otherwise the 6603 * scores can't be compared. This happens because 6604 * suggest_try_change() doesn't compute the soundalike 6605 * word to keep it fast, while some special methods set 6606 * the soundalike score to zero. */ 6607 if (had_bonus) 6608 rescore_one(su, stp); 6609 else 6610 { 6611 new_sug.st_word = stp->st_word; 6612 new_sug.st_wordlen = stp->st_wordlen; 6613 new_sug.st_slang = stp->st_slang; 6614 new_sug.st_orglen = badlen; 6615 rescore_one(su, &new_sug); 6616 } 6617 } 6618 6619 if (stp->st_score > new_sug.st_score) 6620 { 6621 stp->st_score = new_sug.st_score; 6622 stp->st_altscore = new_sug.st_altscore; 6623 stp->st_had_bonus = new_sug.st_had_bonus; 6624 } 6625 break; 6626 } 6627 } 6628 6629 if (i < 0 && ga_grow(gap, 1) == OK) 6630 { 6631 /* Add a suggestion. */ 6632 stp = &SUG(*gap, gap->ga_len); 6633 stp->st_word = vim_strnsave(goodword, goodlen); 6634 if (stp->st_word != NULL) 6635 { 6636 stp->st_wordlen = goodlen; 6637 stp->st_score = score; 6638 stp->st_altscore = altscore; 6639 stp->st_had_bonus = had_bonus; 6640 stp->st_orglen = badlen; 6641 stp->st_slang = slang; 6642 ++gap->ga_len; 6643 6644 /* If we have too many suggestions now, sort the list and keep 6645 * the best suggestions. */ 6646 if (gap->ga_len > SUG_MAX_COUNT(su)) 6647 { 6648 if (maxsf) 6649 su->su_sfmaxscore = cleanup_suggestions(gap, 6650 su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); 6651 else 6652 su->su_maxscore = cleanup_suggestions(gap, 6653 su->su_maxscore, SUG_CLEAN_COUNT(su)); 6654 } 6655 } 6656 } 6657 } 6658 6659 /* 6660 * Suggestions may in fact be flagged as errors. Esp. for banned words and 6661 * for split words, such as "the the". Remove these from the list here. 6662 */ 6663 static void 6664 check_suggestions( 6665 suginfo_T *su, 6666 garray_T *gap) /* either su_ga or su_sga */ 6667 { 6668 suggest_T *stp; 6669 int i; 6670 char_u longword[MAXWLEN + 1]; 6671 int len; 6672 hlf_T attr; 6673 6674 stp = &SUG(*gap, 0); 6675 for (i = gap->ga_len - 1; i >= 0; --i) 6676 { 6677 /* Need to append what follows to check for "the the". */ 6678 vim_strncpy(longword, stp[i].st_word, MAXWLEN); 6679 len = stp[i].st_wordlen; 6680 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen, 6681 MAXWLEN - len); 6682 attr = HLF_COUNT; 6683 (void)spell_check(curwin, longword, &attr, NULL, FALSE); 6684 if (attr != HLF_COUNT) 6685 { 6686 /* Remove this entry. */ 6687 vim_free(stp[i].st_word); 6688 --gap->ga_len; 6689 if (i < gap->ga_len) 6690 mch_memmove(stp + i, stp + i + 1, 6691 sizeof(suggest_T) * (gap->ga_len - i)); 6692 } 6693 } 6694 } 6695 6696 6697 /* 6698 * Add a word to be banned. 6699 */ 6700 static void 6701 add_banned( 6702 suginfo_T *su, 6703 char_u *word) 6704 { 6705 char_u *s; 6706 hash_T hash; 6707 hashitem_T *hi; 6708 6709 hash = hash_hash(word); 6710 hi = hash_lookup(&su->su_banned, word, hash); 6711 if (HASHITEM_EMPTY(hi)) 6712 { 6713 s = vim_strsave(word); 6714 if (s != NULL) 6715 hash_add_item(&su->su_banned, hi, s, hash); 6716 } 6717 } 6718 6719 /* 6720 * Recompute the score for all suggestions if sound-folding is possible. This 6721 * is slow, thus only done for the final results. 6722 */ 6723 static void 6724 rescore_suggestions(suginfo_T *su) 6725 { 6726 int i; 6727 6728 if (su->su_sallang != NULL) 6729 for (i = 0; i < su->su_ga.ga_len; ++i) 6730 rescore_one(su, &SUG(su->su_ga, i)); 6731 } 6732 6733 /* 6734 * Recompute the score for one suggestion if sound-folding is possible. 6735 */ 6736 static void 6737 rescore_one(suginfo_T *su, suggest_T *stp) 6738 { 6739 slang_T *slang = stp->st_slang; 6740 char_u sal_badword[MAXWLEN]; 6741 char_u *p; 6742 6743 /* Only rescore suggestions that have no sal score yet and do have a 6744 * language. */ 6745 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus) 6746 { 6747 if (slang == su->su_sallang) 6748 p = su->su_sal_badword; 6749 else 6750 { 6751 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword); 6752 p = sal_badword; 6753 } 6754 6755 stp->st_altscore = stp_sal_score(stp, su, slang, p); 6756 if (stp->st_altscore == SCORE_MAXMAX) 6757 stp->st_altscore = SCORE_BIG; 6758 stp->st_score = RESCORE(stp->st_score, stp->st_altscore); 6759 stp->st_had_bonus = TRUE; 6760 } 6761 } 6762 6763 static int sug_compare(const void *s1, const void *s2); 6764 6765 /* 6766 * Function given to qsort() to sort the suggestions on st_score. 6767 * First on "st_score", then "st_altscore" then alphabetically. 6768 */ 6769 static int 6770 sug_compare(const void *s1, const void *s2) 6771 { 6772 suggest_T *p1 = (suggest_T *)s1; 6773 suggest_T *p2 = (suggest_T *)s2; 6774 int n = p1->st_score - p2->st_score; 6775 6776 if (n == 0) 6777 { 6778 n = p1->st_altscore - p2->st_altscore; 6779 if (n == 0) 6780 n = STRICMP(p1->st_word, p2->st_word); 6781 } 6782 return n; 6783 } 6784 6785 /* 6786 * Cleanup the suggestions: 6787 * - Sort on score. 6788 * - Remove words that won't be displayed. 6789 * Returns the maximum score in the list or "maxscore" unmodified. 6790 */ 6791 static int 6792 cleanup_suggestions( 6793 garray_T *gap, 6794 int maxscore, 6795 int keep) /* nr of suggestions to keep */ 6796 { 6797 suggest_T *stp = &SUG(*gap, 0); 6798 int i; 6799 6800 /* Sort the list. */ 6801 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); 6802 6803 /* Truncate the list to the number of suggestions that will be displayed. */ 6804 if (gap->ga_len > keep) 6805 { 6806 for (i = keep; i < gap->ga_len; ++i) 6807 vim_free(stp[i].st_word); 6808 gap->ga_len = keep; 6809 return stp[keep - 1].st_score; 6810 } 6811 return maxscore; 6812 } 6813 6814 #if defined(FEAT_EVAL) || defined(PROTO) 6815 /* 6816 * Soundfold a string, for soundfold(). 6817 * Result is in allocated memory, NULL for an error. 6818 */ 6819 char_u * 6820 eval_soundfold(char_u *word) 6821 { 6822 langp_T *lp; 6823 char_u sound[MAXWLEN]; 6824 int lpi; 6825 6826 if (curwin->w_p_spell && *curwin->w_s->b_p_spl != NUL) 6827 /* Use the sound-folding of the first language that supports it. */ 6828 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6829 { 6830 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6831 if (lp->lp_slang->sl_sal.ga_len > 0) 6832 { 6833 /* soundfold the word */ 6834 spell_soundfold(lp->lp_slang, word, FALSE, sound); 6835 return vim_strsave(sound); 6836 } 6837 } 6838 6839 /* No language with sound folding, return word as-is. */ 6840 return vim_strsave(word); 6841 } 6842 #endif 6843 6844 /* 6845 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 6846 * 6847 * There are many ways to turn a word into a sound-a-like representation. The 6848 * oldest is Soundex (1918!). A nice overview can be found in "Approximate 6849 * swedish name matching - survey and test of different algorithms" by Klas 6850 * Erikson. 6851 * 6852 * We support two methods: 6853 * 1. SOFOFROM/SOFOTO do a simple character mapping. 6854 * 2. SAL items define a more advanced sound-folding (and much slower). 6855 */ 6856 void 6857 spell_soundfold( 6858 slang_T *slang, 6859 char_u *inword, 6860 int folded, /* "inword" is already case-folded */ 6861 char_u *res) 6862 { 6863 char_u fword[MAXWLEN]; 6864 char_u *word; 6865 6866 if (slang->sl_sofo) 6867 /* SOFOFROM and SOFOTO used */ 6868 spell_soundfold_sofo(slang, inword, res); 6869 else 6870 { 6871 /* SAL items used. Requires the word to be case-folded. */ 6872 if (folded) 6873 word = inword; 6874 else 6875 { 6876 (void)spell_casefold(inword, (int)STRLEN(inword), fword, MAXWLEN); 6877 word = fword; 6878 } 6879 6880 if (has_mbyte) 6881 spell_soundfold_wsal(slang, word, res); 6882 else 6883 spell_soundfold_sal(slang, word, res); 6884 } 6885 } 6886 6887 /* 6888 * Perform sound folding of "inword" into "res" according to SOFOFROM and 6889 * SOFOTO lines. 6890 */ 6891 static void 6892 spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res) 6893 { 6894 char_u *s; 6895 int ri = 0; 6896 int c; 6897 6898 if (has_mbyte) 6899 { 6900 int prevc = 0; 6901 int *ip; 6902 6903 /* The sl_sal_first[] table contains the translation for chars up to 6904 * 255, sl_sal the rest. */ 6905 for (s = inword; *s != NUL; ) 6906 { 6907 c = mb_cptr2char_adv(&s); 6908 if (enc_utf8 ? utf_class(c) == 0 : VIM_ISWHITE(c)) 6909 c = ' '; 6910 else if (c < 256) 6911 c = slang->sl_sal_first[c]; 6912 else 6913 { 6914 ip = ((int **)slang->sl_sal.ga_data)[c & 0xff]; 6915 if (ip == NULL) /* empty list, can't match */ 6916 c = NUL; 6917 else 6918 for (;;) /* find "c" in the list */ 6919 { 6920 if (*ip == 0) /* not found */ 6921 { 6922 c = NUL; 6923 break; 6924 } 6925 if (*ip == c) /* match! */ 6926 { 6927 c = ip[1]; 6928 break; 6929 } 6930 ip += 2; 6931 } 6932 } 6933 6934 if (c != NUL && c != prevc) 6935 { 6936 ri += mb_char2bytes(c, res + ri); 6937 if (ri + MB_MAXBYTES > MAXWLEN) 6938 break; 6939 prevc = c; 6940 } 6941 } 6942 } 6943 else 6944 { 6945 /* The sl_sal_first[] table contains the translation. */ 6946 for (s = inword; (c = *s) != NUL; ++s) 6947 { 6948 if (VIM_ISWHITE(c)) 6949 c = ' '; 6950 else 6951 c = slang->sl_sal_first[c]; 6952 if (c != NUL && (ri == 0 || res[ri - 1] != c)) 6953 res[ri++] = c; 6954 } 6955 } 6956 6957 res[ri] = NUL; 6958 } 6959 6960 static void 6961 spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res) 6962 { 6963 salitem_T *smp; 6964 char_u word[MAXWLEN]; 6965 char_u *s = inword; 6966 char_u *t; 6967 char_u *pf; 6968 int i, j, z; 6969 int reslen; 6970 int n, k = 0; 6971 int z0; 6972 int k0; 6973 int n0; 6974 int c; 6975 int pri; 6976 int p0 = -333; 6977 int c0; 6978 6979 /* Remove accents, if wanted. We actually remove all non-word characters. 6980 * But keep white space. We need a copy, the word may be changed here. */ 6981 if (slang->sl_rem_accents) 6982 { 6983 t = word; 6984 while (*s != NUL) 6985 { 6986 if (VIM_ISWHITE(*s)) 6987 { 6988 *t++ = ' '; 6989 s = skipwhite(s); 6990 } 6991 else 6992 { 6993 if (spell_iswordp_nmw(s, curwin)) 6994 *t++ = *s; 6995 ++s; 6996 } 6997 } 6998 *t = NUL; 6999 } 7000 else 7001 vim_strncpy(word, s, MAXWLEN - 1); 7002 7003 smp = (salitem_T *)slang->sl_sal.ga_data; 7004 7005 /* 7006 * This comes from Aspell phonet.cpp. Converted from C++ to C. 7007 * Changed to keep spaces. 7008 */ 7009 i = reslen = z = 0; 7010 while ((c = word[i]) != NUL) 7011 { 7012 /* Start with the first rule that has the character in the word. */ 7013 n = slang->sl_sal_first[c]; 7014 z0 = 0; 7015 7016 if (n >= 0) 7017 { 7018 /* check all rules for the same letter */ 7019 for (; (s = smp[n].sm_lead)[0] == c; ++n) 7020 { 7021 /* Quickly skip entries that don't match the word. Most 7022 * entries are less then three chars, optimize for that. */ 7023 k = smp[n].sm_leadlen; 7024 if (k > 1) 7025 { 7026 if (word[i + 1] != s[1]) 7027 continue; 7028 if (k > 2) 7029 { 7030 for (j = 2; j < k; ++j) 7031 if (word[i + j] != s[j]) 7032 break; 7033 if (j < k) 7034 continue; 7035 } 7036 } 7037 7038 if ((pf = smp[n].sm_oneof) != NULL) 7039 { 7040 /* Check for match with one of the chars in "sm_oneof". */ 7041 while (*pf != NUL && *pf != word[i + k]) 7042 ++pf; 7043 if (*pf == NUL) 7044 continue; 7045 ++k; 7046 } 7047 s = smp[n].sm_rules; 7048 pri = 5; /* default priority */ 7049 7050 p0 = *s; 7051 k0 = k; 7052 while (*s == '-' && k > 1) 7053 { 7054 k--; 7055 s++; 7056 } 7057 if (*s == '<') 7058 s++; 7059 if (VIM_ISDIGIT(*s)) 7060 { 7061 /* determine priority */ 7062 pri = *s - '0'; 7063 s++; 7064 } 7065 if (*s == '^' && *(s + 1) == '^') 7066 s++; 7067 7068 if (*s == NUL 7069 || (*s == '^' 7070 && (i == 0 || !(word[i - 1] == ' ' 7071 || spell_iswordp(word + i - 1, curwin))) 7072 && (*(s + 1) != '$' 7073 || (!spell_iswordp(word + i + k0, curwin)))) 7074 || (*s == '$' && i > 0 7075 && spell_iswordp(word + i - 1, curwin) 7076 && (!spell_iswordp(word + i + k0, curwin)))) 7077 { 7078 /* search for followup rules, if: */ 7079 /* followup and k > 1 and NO '-' in searchstring */ 7080 c0 = word[i + k - 1]; 7081 n0 = slang->sl_sal_first[c0]; 7082 7083 if (slang->sl_followup && k > 1 && n0 >= 0 7084 && p0 != '-' && word[i + k] != NUL) 7085 { 7086 /* test follow-up rule for "word[i + k]" */ 7087 for ( ; (s = smp[n0].sm_lead)[0] == c0; ++n0) 7088 { 7089 /* Quickly skip entries that don't match the word. 7090 * */ 7091 k0 = smp[n0].sm_leadlen; 7092 if (k0 > 1) 7093 { 7094 if (word[i + k] != s[1]) 7095 continue; 7096 if (k0 > 2) 7097 { 7098 pf = word + i + k + 1; 7099 for (j = 2; j < k0; ++j) 7100 if (*pf++ != s[j]) 7101 break; 7102 if (j < k0) 7103 continue; 7104 } 7105 } 7106 k0 += k - 1; 7107 7108 if ((pf = smp[n0].sm_oneof) != NULL) 7109 { 7110 /* Check for match with one of the chars in 7111 * "sm_oneof". */ 7112 while (*pf != NUL && *pf != word[i + k0]) 7113 ++pf; 7114 if (*pf == NUL) 7115 continue; 7116 ++k0; 7117 } 7118 7119 p0 = 5; 7120 s = smp[n0].sm_rules; 7121 while (*s == '-') 7122 { 7123 /* "k0" gets NOT reduced because 7124 * "if (k0 == k)" */ 7125 s++; 7126 } 7127 if (*s == '<') 7128 s++; 7129 if (VIM_ISDIGIT(*s)) 7130 { 7131 p0 = *s - '0'; 7132 s++; 7133 } 7134 7135 if (*s == NUL 7136 /* *s == '^' cuts */ 7137 || (*s == '$' 7138 && !spell_iswordp(word + i + k0, 7139 curwin))) 7140 { 7141 if (k0 == k) 7142 /* this is just a piece of the string */ 7143 continue; 7144 7145 if (p0 < pri) 7146 /* priority too low */ 7147 continue; 7148 /* rule fits; stop search */ 7149 break; 7150 } 7151 } 7152 7153 if (p0 >= pri && smp[n0].sm_lead[0] == c0) 7154 continue; 7155 } 7156 7157 /* replace string */ 7158 s = smp[n].sm_to; 7159 if (s == NULL) 7160 s = (char_u *)""; 7161 pf = smp[n].sm_rules; 7162 p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0; 7163 if (p0 == 1 && z == 0) 7164 { 7165 /* rule with '<' is used */ 7166 if (reslen > 0 && *s != NUL && (res[reslen - 1] == c 7167 || res[reslen - 1] == *s)) 7168 reslen--; 7169 z0 = 1; 7170 z = 1; 7171 k0 = 0; 7172 while (*s != NUL && word[i + k0] != NUL) 7173 { 7174 word[i + k0] = *s; 7175 k0++; 7176 s++; 7177 } 7178 if (k > k0) 7179 STRMOVE(word + i + k0, word + i + k); 7180 7181 /* new "actual letter" */ 7182 c = word[i]; 7183 } 7184 else 7185 { 7186 /* no '<' rule used */ 7187 i += k - 1; 7188 z = 0; 7189 while (*s != NUL && s[1] != NUL && reslen < MAXWLEN) 7190 { 7191 if (reslen == 0 || res[reslen - 1] != *s) 7192 res[reslen++] = *s; 7193 s++; 7194 } 7195 /* new "actual letter" */ 7196 c = *s; 7197 if (strstr((char *)pf, "^^") != NULL) 7198 { 7199 if (c != NUL) 7200 res[reslen++] = c; 7201 STRMOVE(word, word + i + 1); 7202 i = 0; 7203 z0 = 1; 7204 } 7205 } 7206 break; 7207 } 7208 } 7209 } 7210 else if (VIM_ISWHITE(c)) 7211 { 7212 c = ' '; 7213 k = 1; 7214 } 7215 7216 if (z0 == 0) 7217 { 7218 if (k && !p0 && reslen < MAXWLEN && c != NUL 7219 && (!slang->sl_collapse || reslen == 0 7220 || res[reslen - 1] != c)) 7221 /* condense only double letters */ 7222 res[reslen++] = c; 7223 7224 i++; 7225 z = 0; 7226 k = 0; 7227 } 7228 } 7229 7230 res[reslen] = NUL; 7231 } 7232 7233 /* 7234 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 7235 * Multi-byte version of spell_soundfold(). 7236 */ 7237 static void 7238 spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res) 7239 { 7240 salitem_T *smp = (salitem_T *)slang->sl_sal.ga_data; 7241 int word[MAXWLEN]; 7242 int wres[MAXWLEN]; 7243 int l; 7244 char_u *s; 7245 int *ws; 7246 char_u *t; 7247 int *pf; 7248 int i, j, z; 7249 int reslen; 7250 int n, k = 0; 7251 int z0; 7252 int k0; 7253 int n0; 7254 int c; 7255 int pri; 7256 int p0 = -333; 7257 int c0; 7258 int did_white = FALSE; 7259 int wordlen; 7260 7261 7262 /* 7263 * Convert the multi-byte string to a wide-character string. 7264 * Remove accents, if wanted. We actually remove all non-word characters. 7265 * But keep white space. 7266 */ 7267 wordlen = 0; 7268 for (s = inword; *s != NUL; ) 7269 { 7270 t = s; 7271 c = mb_cptr2char_adv(&s); 7272 if (slang->sl_rem_accents) 7273 { 7274 if (enc_utf8 ? utf_class(c) == 0 : VIM_ISWHITE(c)) 7275 { 7276 if (did_white) 7277 continue; 7278 c = ' '; 7279 did_white = TRUE; 7280 } 7281 else 7282 { 7283 did_white = FALSE; 7284 if (!spell_iswordp_nmw(t, curwin)) 7285 continue; 7286 } 7287 } 7288 word[wordlen++] = c; 7289 } 7290 word[wordlen] = NUL; 7291 7292 /* 7293 * This algorithm comes from Aspell phonet.cpp. 7294 * Converted from C++ to C. Added support for multi-byte chars. 7295 * Changed to keep spaces. 7296 */ 7297 i = reslen = z = 0; 7298 while ((c = word[i]) != NUL) 7299 { 7300 /* Start with the first rule that has the character in the word. */ 7301 n = slang->sl_sal_first[c & 0xff]; 7302 z0 = 0; 7303 7304 if (n >= 0) 7305 { 7306 /* Check all rules for the same index byte. 7307 * If c is 0x300 need extra check for the end of the array, as 7308 * (c & 0xff) is NUL. */ 7309 for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff) 7310 && ws[0] != NUL; ++n) 7311 { 7312 /* Quickly skip entries that don't match the word. Most 7313 * entries are less then three chars, optimize for that. */ 7314 if (c != ws[0]) 7315 continue; 7316 k = smp[n].sm_leadlen; 7317 if (k > 1) 7318 { 7319 if (word[i + 1] != ws[1]) 7320 continue; 7321 if (k > 2) 7322 { 7323 for (j = 2; j < k; ++j) 7324 if (word[i + j] != ws[j]) 7325 break; 7326 if (j < k) 7327 continue; 7328 } 7329 } 7330 7331 if ((pf = smp[n].sm_oneof_w) != NULL) 7332 { 7333 /* Check for match with one of the chars in "sm_oneof". */ 7334 while (*pf != NUL && *pf != word[i + k]) 7335 ++pf; 7336 if (*pf == NUL) 7337 continue; 7338 ++k; 7339 } 7340 s = smp[n].sm_rules; 7341 pri = 5; /* default priority */ 7342 7343 p0 = *s; 7344 k0 = k; 7345 while (*s == '-' && k > 1) 7346 { 7347 k--; 7348 s++; 7349 } 7350 if (*s == '<') 7351 s++; 7352 if (VIM_ISDIGIT(*s)) 7353 { 7354 /* determine priority */ 7355 pri = *s - '0'; 7356 s++; 7357 } 7358 if (*s == '^' && *(s + 1) == '^') 7359 s++; 7360 7361 if (*s == NUL 7362 || (*s == '^' 7363 && (i == 0 || !(word[i - 1] == ' ' 7364 || spell_iswordp_w(word + i - 1, curwin))) 7365 && (*(s + 1) != '$' 7366 || (!spell_iswordp_w(word + i + k0, curwin)))) 7367 || (*s == '$' && i > 0 7368 && spell_iswordp_w(word + i - 1, curwin) 7369 && (!spell_iswordp_w(word + i + k0, curwin)))) 7370 { 7371 /* search for followup rules, if: */ 7372 /* followup and k > 1 and NO '-' in searchstring */ 7373 c0 = word[i + k - 1]; 7374 n0 = slang->sl_sal_first[c0 & 0xff]; 7375 7376 if (slang->sl_followup && k > 1 && n0 >= 0 7377 && p0 != '-' && word[i + k] != NUL) 7378 { 7379 /* Test follow-up rule for "word[i + k]"; loop over 7380 * all entries with the same index byte. */ 7381 for ( ; ((ws = smp[n0].sm_lead_w)[0] & 0xff) 7382 == (c0 & 0xff); ++n0) 7383 { 7384 /* Quickly skip entries that don't match the word. 7385 */ 7386 if (c0 != ws[0]) 7387 continue; 7388 k0 = smp[n0].sm_leadlen; 7389 if (k0 > 1) 7390 { 7391 if (word[i + k] != ws[1]) 7392 continue; 7393 if (k0 > 2) 7394 { 7395 pf = word + i + k + 1; 7396 for (j = 2; j < k0; ++j) 7397 if (*pf++ != ws[j]) 7398 break; 7399 if (j < k0) 7400 continue; 7401 } 7402 } 7403 k0 += k - 1; 7404 7405 if ((pf = smp[n0].sm_oneof_w) != NULL) 7406 { 7407 /* Check for match with one of the chars in 7408 * "sm_oneof". */ 7409 while (*pf != NUL && *pf != word[i + k0]) 7410 ++pf; 7411 if (*pf == NUL) 7412 continue; 7413 ++k0; 7414 } 7415 7416 p0 = 5; 7417 s = smp[n0].sm_rules; 7418 while (*s == '-') 7419 { 7420 /* "k0" gets NOT reduced because 7421 * "if (k0 == k)" */ 7422 s++; 7423 } 7424 if (*s == '<') 7425 s++; 7426 if (VIM_ISDIGIT(*s)) 7427 { 7428 p0 = *s - '0'; 7429 s++; 7430 } 7431 7432 if (*s == NUL 7433 /* *s == '^' cuts */ 7434 || (*s == '$' 7435 && !spell_iswordp_w(word + i + k0, 7436 curwin))) 7437 { 7438 if (k0 == k) 7439 /* this is just a piece of the string */ 7440 continue; 7441 7442 if (p0 < pri) 7443 /* priority too low */ 7444 continue; 7445 /* rule fits; stop search */ 7446 break; 7447 } 7448 } 7449 7450 if (p0 >= pri && (smp[n0].sm_lead_w[0] & 0xff) 7451 == (c0 & 0xff)) 7452 continue; 7453 } 7454 7455 /* replace string */ 7456 ws = smp[n].sm_to_w; 7457 s = smp[n].sm_rules; 7458 p0 = (vim_strchr(s, '<') != NULL) ? 1 : 0; 7459 if (p0 == 1 && z == 0) 7460 { 7461 /* rule with '<' is used */ 7462 if (reslen > 0 && ws != NULL && *ws != NUL 7463 && (wres[reslen - 1] == c 7464 || wres[reslen - 1] == *ws)) 7465 reslen--; 7466 z0 = 1; 7467 z = 1; 7468 k0 = 0; 7469 if (ws != NULL) 7470 while (*ws != NUL && word[i + k0] != NUL) 7471 { 7472 word[i + k0] = *ws; 7473 k0++; 7474 ws++; 7475 } 7476 if (k > k0) 7477 mch_memmove(word + i + k0, word + i + k, 7478 sizeof(int) * (wordlen - (i + k) + 1)); 7479 7480 /* new "actual letter" */ 7481 c = word[i]; 7482 } 7483 else 7484 { 7485 /* no '<' rule used */ 7486 i += k - 1; 7487 z = 0; 7488 if (ws != NULL) 7489 while (*ws != NUL && ws[1] != NUL 7490 && reslen < MAXWLEN) 7491 { 7492 if (reslen == 0 || wres[reslen - 1] != *ws) 7493 wres[reslen++] = *ws; 7494 ws++; 7495 } 7496 /* new "actual letter" */ 7497 if (ws == NULL) 7498 c = NUL; 7499 else 7500 c = *ws; 7501 if (strstr((char *)s, "^^") != NULL) 7502 { 7503 if (c != NUL) 7504 wres[reslen++] = c; 7505 mch_memmove(word, word + i + 1, 7506 sizeof(int) * (wordlen - (i + 1) + 1)); 7507 i = 0; 7508 z0 = 1; 7509 } 7510 } 7511 break; 7512 } 7513 } 7514 } 7515 else if (VIM_ISWHITE(c)) 7516 { 7517 c = ' '; 7518 k = 1; 7519 } 7520 7521 if (z0 == 0) 7522 { 7523 if (k && !p0 && reslen < MAXWLEN && c != NUL 7524 && (!slang->sl_collapse || reslen == 0 7525 || wres[reslen - 1] != c)) 7526 /* condense only double letters */ 7527 wres[reslen++] = c; 7528 7529 i++; 7530 z = 0; 7531 k = 0; 7532 } 7533 } 7534 7535 /* Convert wide characters in "wres" to a multi-byte string in "res". */ 7536 l = 0; 7537 for (n = 0; n < reslen; ++n) 7538 { 7539 l += mb_char2bytes(wres[n], res + l); 7540 if (l + MB_MAXBYTES > MAXWLEN) 7541 break; 7542 } 7543 res[l] = NUL; 7544 } 7545 7546 /* 7547 * Compute a score for two sound-a-like words. 7548 * This permits up to two inserts/deletes/swaps/etc. to keep things fast. 7549 * Instead of a generic loop we write out the code. That keeps it fast by 7550 * avoiding checks that will not be possible. 7551 */ 7552 static int 7553 soundalike_score( 7554 char_u *goodstart, /* sound-folded good word */ 7555 char_u *badstart) /* sound-folded bad word */ 7556 { 7557 char_u *goodsound = goodstart; 7558 char_u *badsound = badstart; 7559 int goodlen; 7560 int badlen; 7561 int n; 7562 char_u *pl, *ps; 7563 char_u *pl2, *ps2; 7564 int score = 0; 7565 7566 /* Adding/inserting "*" at the start (word starts with vowel) shouldn't be 7567 * counted so much, vowels halfway the word aren't counted at all. */ 7568 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) 7569 { 7570 if ((badsound[0] == NUL && goodsound[1] == NUL) 7571 || (goodsound[0] == NUL && badsound[1] == NUL)) 7572 /* changing word with vowel to word without a sound */ 7573 return SCORE_DEL; 7574 if (badsound[0] == NUL || goodsound[0] == NUL) 7575 /* more than two changes */ 7576 return SCORE_MAXMAX; 7577 7578 if (badsound[1] == goodsound[1] 7579 || (badsound[1] != NUL 7580 && goodsound[1] != NUL 7581 && badsound[2] == goodsound[2])) 7582 { 7583 /* handle like a substitute */ 7584 } 7585 else 7586 { 7587 score = 2 * SCORE_DEL / 3; 7588 if (*badsound == '*') 7589 ++badsound; 7590 else 7591 ++goodsound; 7592 } 7593 } 7594 7595 goodlen = (int)STRLEN(goodsound); 7596 badlen = (int)STRLEN(badsound); 7597 7598 /* Return quickly if the lengths are too different to be fixed by two 7599 * changes. */ 7600 n = goodlen - badlen; 7601 if (n < -2 || n > 2) 7602 return SCORE_MAXMAX; 7603 7604 if (n > 0) 7605 { 7606 pl = goodsound; /* goodsound is longest */ 7607 ps = badsound; 7608 } 7609 else 7610 { 7611 pl = badsound; /* badsound is longest */ 7612 ps = goodsound; 7613 } 7614 7615 /* Skip over the identical part. */ 7616 while (*pl == *ps && *pl != NUL) 7617 { 7618 ++pl; 7619 ++ps; 7620 } 7621 7622 switch (n) 7623 { 7624 case -2: 7625 case 2: 7626 /* 7627 * Must delete two characters from "pl". 7628 */ 7629 ++pl; /* first delete */ 7630 while (*pl == *ps) 7631 { 7632 ++pl; 7633 ++ps; 7634 } 7635 /* strings must be equal after second delete */ 7636 if (STRCMP(pl + 1, ps) == 0) 7637 return score + SCORE_DEL * 2; 7638 7639 /* Failed to compare. */ 7640 break; 7641 7642 case -1: 7643 case 1: 7644 /* 7645 * Minimal one delete from "pl" required. 7646 */ 7647 7648 /* 1: delete */ 7649 pl2 = pl + 1; 7650 ps2 = ps; 7651 while (*pl2 == *ps2) 7652 { 7653 if (*pl2 == NUL) /* reached the end */ 7654 return score + SCORE_DEL; 7655 ++pl2; 7656 ++ps2; 7657 } 7658 7659 /* 2: delete then swap, then rest must be equal */ 7660 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 7661 && STRCMP(pl2 + 2, ps2 + 2) == 0) 7662 return score + SCORE_DEL + SCORE_SWAP; 7663 7664 /* 3: delete then substitute, then the rest must be equal */ 7665 if (STRCMP(pl2 + 1, ps2 + 1) == 0) 7666 return score + SCORE_DEL + SCORE_SUBST; 7667 7668 /* 4: first swap then delete */ 7669 if (pl[0] == ps[1] && pl[1] == ps[0]) 7670 { 7671 pl2 = pl + 2; /* swap, skip two chars */ 7672 ps2 = ps + 2; 7673 while (*pl2 == *ps2) 7674 { 7675 ++pl2; 7676 ++ps2; 7677 } 7678 /* delete a char and then strings must be equal */ 7679 if (STRCMP(pl2 + 1, ps2) == 0) 7680 return score + SCORE_SWAP + SCORE_DEL; 7681 } 7682 7683 /* 5: first substitute then delete */ 7684 pl2 = pl + 1; /* substitute, skip one char */ 7685 ps2 = ps + 1; 7686 while (*pl2 == *ps2) 7687 { 7688 ++pl2; 7689 ++ps2; 7690 } 7691 /* delete a char and then strings must be equal */ 7692 if (STRCMP(pl2 + 1, ps2) == 0) 7693 return score + SCORE_SUBST + SCORE_DEL; 7694 7695 /* Failed to compare. */ 7696 break; 7697 7698 case 0: 7699 /* 7700 * Lengths are equal, thus changes must result in same length: An 7701 * insert is only possible in combination with a delete. 7702 * 1: check if for identical strings 7703 */ 7704 if (*pl == NUL) 7705 return score; 7706 7707 /* 2: swap */ 7708 if (pl[0] == ps[1] && pl[1] == ps[0]) 7709 { 7710 pl2 = pl + 2; /* swap, skip two chars */ 7711 ps2 = ps + 2; 7712 while (*pl2 == *ps2) 7713 { 7714 if (*pl2 == NUL) /* reached the end */ 7715 return score + SCORE_SWAP; 7716 ++pl2; 7717 ++ps2; 7718 } 7719 /* 3: swap and swap again */ 7720 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 7721 && STRCMP(pl2 + 2, ps2 + 2) == 0) 7722 return score + SCORE_SWAP + SCORE_SWAP; 7723 7724 /* 4: swap and substitute */ 7725 if (STRCMP(pl2 + 1, ps2 + 1) == 0) 7726 return score + SCORE_SWAP + SCORE_SUBST; 7727 } 7728 7729 /* 5: substitute */ 7730 pl2 = pl + 1; 7731 ps2 = ps + 1; 7732 while (*pl2 == *ps2) 7733 { 7734 if (*pl2 == NUL) /* reached the end */ 7735 return score + SCORE_SUBST; 7736 ++pl2; 7737 ++ps2; 7738 } 7739 7740 /* 6: substitute and swap */ 7741 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 7742 && STRCMP(pl2 + 2, ps2 + 2) == 0) 7743 return score + SCORE_SUBST + SCORE_SWAP; 7744 7745 /* 7: substitute and substitute */ 7746 if (STRCMP(pl2 + 1, ps2 + 1) == 0) 7747 return score + SCORE_SUBST + SCORE_SUBST; 7748 7749 /* 8: insert then delete */ 7750 pl2 = pl; 7751 ps2 = ps + 1; 7752 while (*pl2 == *ps2) 7753 { 7754 ++pl2; 7755 ++ps2; 7756 } 7757 if (STRCMP(pl2 + 1, ps2) == 0) 7758 return score + SCORE_INS + SCORE_DEL; 7759 7760 /* 9: delete then insert */ 7761 pl2 = pl + 1; 7762 ps2 = ps; 7763 while (*pl2 == *ps2) 7764 { 7765 ++pl2; 7766 ++ps2; 7767 } 7768 if (STRCMP(pl2, ps2 + 1) == 0) 7769 return score + SCORE_INS + SCORE_DEL; 7770 7771 /* Failed to compare. */ 7772 break; 7773 } 7774 7775 return SCORE_MAXMAX; 7776 } 7777 7778 /* 7779 * Compute the "edit distance" to turn "badword" into "goodword". The less 7780 * deletes/inserts/substitutes/swaps are required the lower the score. 7781 * 7782 * The algorithm is described by Du and Chang, 1992. 7783 * The implementation of the algorithm comes from Aspell editdist.cpp, 7784 * edit_distance(). It has been converted from C++ to C and modified to 7785 * support multi-byte characters. 7786 */ 7787 static int 7788 spell_edit_score( 7789 slang_T *slang, 7790 char_u *badword, 7791 char_u *goodword) 7792 { 7793 int *cnt; 7794 int badlen, goodlen; /* lengths including NUL */ 7795 int j, i; 7796 int t; 7797 int bc, gc; 7798 int pbc, pgc; 7799 char_u *p; 7800 int wbadword[MAXWLEN]; 7801 int wgoodword[MAXWLEN]; 7802 7803 if (has_mbyte) 7804 { 7805 /* Get the characters from the multi-byte strings and put them in an 7806 * int array for easy access. */ 7807 for (p = badword, badlen = 0; *p != NUL; ) 7808 wbadword[badlen++] = mb_cptr2char_adv(&p); 7809 wbadword[badlen++] = 0; 7810 for (p = goodword, goodlen = 0; *p != NUL; ) 7811 wgoodword[goodlen++] = mb_cptr2char_adv(&p); 7812 wgoodword[goodlen++] = 0; 7813 } 7814 else 7815 { 7816 badlen = (int)STRLEN(badword) + 1; 7817 goodlen = (int)STRLEN(goodword) + 1; 7818 } 7819 7820 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */ 7821 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] 7822 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)), 7823 TRUE); 7824 if (cnt == NULL) 7825 return 0; /* out of memory */ 7826 7827 CNT(0, 0) = 0; 7828 for (j = 1; j <= goodlen; ++j) 7829 CNT(0, j) = CNT(0, j - 1) + SCORE_INS; 7830 7831 for (i = 1; i <= badlen; ++i) 7832 { 7833 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; 7834 for (j = 1; j <= goodlen; ++j) 7835 { 7836 if (has_mbyte) 7837 { 7838 bc = wbadword[i - 1]; 7839 gc = wgoodword[j - 1]; 7840 } 7841 else 7842 { 7843 bc = badword[i - 1]; 7844 gc = goodword[j - 1]; 7845 } 7846 if (bc == gc) 7847 CNT(i, j) = CNT(i - 1, j - 1); 7848 else 7849 { 7850 /* Use a better score when there is only a case difference. */ 7851 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 7852 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); 7853 else 7854 { 7855 /* For a similar character use SCORE_SIMILAR. */ 7856 if (slang != NULL 7857 && slang->sl_has_map 7858 && similar_chars(slang, gc, bc)) 7859 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); 7860 else 7861 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); 7862 } 7863 7864 if (i > 1 && j > 1) 7865 { 7866 if (has_mbyte) 7867 { 7868 pbc = wbadword[i - 2]; 7869 pgc = wgoodword[j - 2]; 7870 } 7871 else 7872 { 7873 pbc = badword[i - 2]; 7874 pgc = goodword[j - 2]; 7875 } 7876 if (bc == pgc && pbc == gc) 7877 { 7878 t = SCORE_SWAP + CNT(i - 2, j - 2); 7879 if (t < CNT(i, j)) 7880 CNT(i, j) = t; 7881 } 7882 } 7883 t = SCORE_DEL + CNT(i - 1, j); 7884 if (t < CNT(i, j)) 7885 CNT(i, j) = t; 7886 t = SCORE_INS + CNT(i, j - 1); 7887 if (t < CNT(i, j)) 7888 CNT(i, j) = t; 7889 } 7890 } 7891 } 7892 7893 i = CNT(badlen - 1, goodlen - 1); 7894 vim_free(cnt); 7895 return i; 7896 } 7897 7898 typedef struct 7899 { 7900 int badi; 7901 int goodi; 7902 int score; 7903 } limitscore_T; 7904 7905 /* 7906 * Like spell_edit_score(), but with a limit on the score to make it faster. 7907 * May return SCORE_MAXMAX when the score is higher than "limit". 7908 * 7909 * This uses a stack for the edits still to be tried. 7910 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support 7911 * for multi-byte characters. 7912 */ 7913 static int 7914 spell_edit_score_limit( 7915 slang_T *slang, 7916 char_u *badword, 7917 char_u *goodword, 7918 int limit) 7919 { 7920 limitscore_T stack[10]; /* allow for over 3 * 2 edits */ 7921 int stackidx; 7922 int bi, gi; 7923 int bi2, gi2; 7924 int bc, gc; 7925 int score; 7926 int score_off; 7927 int minscore; 7928 int round; 7929 7930 /* Multi-byte characters require a bit more work, use a different function 7931 * to avoid testing "has_mbyte" quite often. */ 7932 if (has_mbyte) 7933 return spell_edit_score_limit_w(slang, badword, goodword, limit); 7934 7935 /* 7936 * The idea is to go from start to end over the words. So long as 7937 * characters are equal just continue, this always gives the lowest score. 7938 * When there is a difference try several alternatives. Each alternative 7939 * increases "score" for the edit distance. Some of the alternatives are 7940 * pushed unto a stack and tried later, some are tried right away. At the 7941 * end of the word the score for one alternative is known. The lowest 7942 * possible score is stored in "minscore". 7943 */ 7944 stackidx = 0; 7945 bi = 0; 7946 gi = 0; 7947 score = 0; 7948 minscore = limit + 1; 7949 7950 for (;;) 7951 { 7952 /* Skip over an equal part, score remains the same. */ 7953 for (;;) 7954 { 7955 bc = badword[bi]; 7956 gc = goodword[gi]; 7957 if (bc != gc) /* stop at a char that's different */ 7958 break; 7959 if (bc == NUL) /* both words end */ 7960 { 7961 if (score < minscore) 7962 minscore = score; 7963 goto pop; /* do next alternative */ 7964 } 7965 ++bi; 7966 ++gi; 7967 } 7968 7969 if (gc == NUL) /* goodword ends, delete badword chars */ 7970 { 7971 do 7972 { 7973 if ((score += SCORE_DEL) >= minscore) 7974 goto pop; /* do next alternative */ 7975 } while (badword[++bi] != NUL); 7976 minscore = score; 7977 } 7978 else if (bc == NUL) /* badword ends, insert badword chars */ 7979 { 7980 do 7981 { 7982 if ((score += SCORE_INS) >= minscore) 7983 goto pop; /* do next alternative */ 7984 } while (goodword[++gi] != NUL); 7985 minscore = score; 7986 } 7987 else /* both words continue */ 7988 { 7989 /* If not close to the limit, perform a change. Only try changes 7990 * that may lead to a lower score than "minscore". 7991 * round 0: try deleting a char from badword 7992 * round 1: try inserting a char in badword */ 7993 for (round = 0; round <= 1; ++round) 7994 { 7995 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); 7996 if (score_off < minscore) 7997 { 7998 if (score_off + SCORE_EDIT_MIN >= minscore) 7999 { 8000 /* Near the limit, rest of the words must match. We 8001 * can check that right now, no need to push an item 8002 * onto the stack. */ 8003 bi2 = bi + 1 - round; 8004 gi2 = gi + round; 8005 while (goodword[gi2] == badword[bi2]) 8006 { 8007 if (goodword[gi2] == NUL) 8008 { 8009 minscore = score_off; 8010 break; 8011 } 8012 ++bi2; 8013 ++gi2; 8014 } 8015 } 8016 else 8017 { 8018 /* try deleting/inserting a character later */ 8019 stack[stackidx].badi = bi + 1 - round; 8020 stack[stackidx].goodi = gi + round; 8021 stack[stackidx].score = score_off; 8022 ++stackidx; 8023 } 8024 } 8025 } 8026 8027 if (score + SCORE_SWAP < minscore) 8028 { 8029 /* If swapping two characters makes a match then the 8030 * substitution is more expensive, thus there is no need to 8031 * try both. */ 8032 if (gc == badword[bi + 1] && bc == goodword[gi + 1]) 8033 { 8034 /* Swap two characters, that is: skip them. */ 8035 gi += 2; 8036 bi += 2; 8037 score += SCORE_SWAP; 8038 continue; 8039 } 8040 } 8041 8042 /* Substitute one character for another which is the same 8043 * thing as deleting a character from both goodword and badword. 8044 * Use a better score when there is only a case difference. */ 8045 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 8046 score += SCORE_ICASE; 8047 else 8048 { 8049 /* For a similar character use SCORE_SIMILAR. */ 8050 if (slang != NULL 8051 && slang->sl_has_map 8052 && similar_chars(slang, gc, bc)) 8053 score += SCORE_SIMILAR; 8054 else 8055 score += SCORE_SUBST; 8056 } 8057 8058 if (score < minscore) 8059 { 8060 /* Do the substitution. */ 8061 ++gi; 8062 ++bi; 8063 continue; 8064 } 8065 } 8066 pop: 8067 /* 8068 * Get here to try the next alternative, pop it from the stack. 8069 */ 8070 if (stackidx == 0) /* stack is empty, finished */ 8071 break; 8072 8073 /* pop an item from the stack */ 8074 --stackidx; 8075 gi = stack[stackidx].goodi; 8076 bi = stack[stackidx].badi; 8077 score = stack[stackidx].score; 8078 } 8079 8080 /* When the score goes over "limit" it may actually be much higher. 8081 * Return a very large number to avoid going below the limit when giving a 8082 * bonus. */ 8083 if (minscore > limit) 8084 return SCORE_MAXMAX; 8085 return minscore; 8086 } 8087 8088 /* 8089 * Multi-byte version of spell_edit_score_limit(). 8090 * Keep it in sync with the above! 8091 */ 8092 static int 8093 spell_edit_score_limit_w( 8094 slang_T *slang, 8095 char_u *badword, 8096 char_u *goodword, 8097 int limit) 8098 { 8099 limitscore_T stack[10]; /* allow for over 3 * 2 edits */ 8100 int stackidx; 8101 int bi, gi; 8102 int bi2, gi2; 8103 int bc, gc; 8104 int score; 8105 int score_off; 8106 int minscore; 8107 int round; 8108 char_u *p; 8109 int wbadword[MAXWLEN]; 8110 int wgoodword[MAXWLEN]; 8111 8112 /* Get the characters from the multi-byte strings and put them in an 8113 * int array for easy access. */ 8114 bi = 0; 8115 for (p = badword; *p != NUL; ) 8116 wbadword[bi++] = mb_cptr2char_adv(&p); 8117 wbadword[bi++] = 0; 8118 gi = 0; 8119 for (p = goodword; *p != NUL; ) 8120 wgoodword[gi++] = mb_cptr2char_adv(&p); 8121 wgoodword[gi++] = 0; 8122 8123 /* 8124 * The idea is to go from start to end over the words. So long as 8125 * characters are equal just continue, this always gives the lowest score. 8126 * When there is a difference try several alternatives. Each alternative 8127 * increases "score" for the edit distance. Some of the alternatives are 8128 * pushed unto a stack and tried later, some are tried right away. At the 8129 * end of the word the score for one alternative is known. The lowest 8130 * possible score is stored in "minscore". 8131 */ 8132 stackidx = 0; 8133 bi = 0; 8134 gi = 0; 8135 score = 0; 8136 minscore = limit + 1; 8137 8138 for (;;) 8139 { 8140 /* Skip over an equal part, score remains the same. */ 8141 for (;;) 8142 { 8143 bc = wbadword[bi]; 8144 gc = wgoodword[gi]; 8145 8146 if (bc != gc) /* stop at a char that's different */ 8147 break; 8148 if (bc == NUL) /* both words end */ 8149 { 8150 if (score < minscore) 8151 minscore = score; 8152 goto pop; /* do next alternative */ 8153 } 8154 ++bi; 8155 ++gi; 8156 } 8157 8158 if (gc == NUL) /* goodword ends, delete badword chars */ 8159 { 8160 do 8161 { 8162 if ((score += SCORE_DEL) >= minscore) 8163 goto pop; /* do next alternative */ 8164 } while (wbadword[++bi] != NUL); 8165 minscore = score; 8166 } 8167 else if (bc == NUL) /* badword ends, insert badword chars */ 8168 { 8169 do 8170 { 8171 if ((score += SCORE_INS) >= minscore) 8172 goto pop; /* do next alternative */ 8173 } while (wgoodword[++gi] != NUL); 8174 minscore = score; 8175 } 8176 else /* both words continue */ 8177 { 8178 /* If not close to the limit, perform a change. Only try changes 8179 * that may lead to a lower score than "minscore". 8180 * round 0: try deleting a char from badword 8181 * round 1: try inserting a char in badword */ 8182 for (round = 0; round <= 1; ++round) 8183 { 8184 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); 8185 if (score_off < minscore) 8186 { 8187 if (score_off + SCORE_EDIT_MIN >= minscore) 8188 { 8189 /* Near the limit, rest of the words must match. We 8190 * can check that right now, no need to push an item 8191 * onto the stack. */ 8192 bi2 = bi + 1 - round; 8193 gi2 = gi + round; 8194 while (wgoodword[gi2] == wbadword[bi2]) 8195 { 8196 if (wgoodword[gi2] == NUL) 8197 { 8198 minscore = score_off; 8199 break; 8200 } 8201 ++bi2; 8202 ++gi2; 8203 } 8204 } 8205 else 8206 { 8207 /* try deleting a character from badword later */ 8208 stack[stackidx].badi = bi + 1 - round; 8209 stack[stackidx].goodi = gi + round; 8210 stack[stackidx].score = score_off; 8211 ++stackidx; 8212 } 8213 } 8214 } 8215 8216 if (score + SCORE_SWAP < minscore) 8217 { 8218 /* If swapping two characters makes a match then the 8219 * substitution is more expensive, thus there is no need to 8220 * try both. */ 8221 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1]) 8222 { 8223 /* Swap two characters, that is: skip them. */ 8224 gi += 2; 8225 bi += 2; 8226 score += SCORE_SWAP; 8227 continue; 8228 } 8229 } 8230 8231 /* Substitute one character for another which is the same 8232 * thing as deleting a character from both goodword and badword. 8233 * Use a better score when there is only a case difference. */ 8234 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 8235 score += SCORE_ICASE; 8236 else 8237 { 8238 /* For a similar character use SCORE_SIMILAR. */ 8239 if (slang != NULL 8240 && slang->sl_has_map 8241 && similar_chars(slang, gc, bc)) 8242 score += SCORE_SIMILAR; 8243 else 8244 score += SCORE_SUBST; 8245 } 8246 8247 if (score < minscore) 8248 { 8249 /* Do the substitution. */ 8250 ++gi; 8251 ++bi; 8252 continue; 8253 } 8254 } 8255 pop: 8256 /* 8257 * Get here to try the next alternative, pop it from the stack. 8258 */ 8259 if (stackidx == 0) /* stack is empty, finished */ 8260 break; 8261 8262 /* pop an item from the stack */ 8263 --stackidx; 8264 gi = stack[stackidx].goodi; 8265 bi = stack[stackidx].badi; 8266 score = stack[stackidx].score; 8267 } 8268 8269 /* When the score goes over "limit" it may actually be much higher. 8270 * Return a very large number to avoid going below the limit when giving a 8271 * bonus. */ 8272 if (minscore > limit) 8273 return SCORE_MAXMAX; 8274 return minscore; 8275 } 8276 8277 /* 8278 * ":spellinfo" 8279 */ 8280 void 8281 ex_spellinfo(exarg_T *eap UNUSED) 8282 { 8283 int lpi; 8284 langp_T *lp; 8285 char_u *p; 8286 8287 if (no_spell_checking(curwin)) 8288 return; 8289 8290 msg_start(); 8291 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len && !got_int; ++lpi) 8292 { 8293 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8294 msg_puts("file: "); 8295 msg_puts((char *)lp->lp_slang->sl_fname); 8296 msg_putchar('\n'); 8297 p = lp->lp_slang->sl_info; 8298 if (p != NULL) 8299 { 8300 msg_puts((char *)p); 8301 msg_putchar('\n'); 8302 } 8303 } 8304 msg_end(); 8305 } 8306 8307 #define DUMPFLAG_KEEPCASE 1 /* round 2: keep-case tree */ 8308 #define DUMPFLAG_COUNT 2 /* include word count */ 8309 #define DUMPFLAG_ICASE 4 /* ignore case when finding matches */ 8310 #define DUMPFLAG_ONECAP 8 /* pattern starts with capital */ 8311 #define DUMPFLAG_ALLCAP 16 /* pattern is all capitals */ 8312 8313 /* 8314 * ":spelldump" 8315 */ 8316 void 8317 ex_spelldump(exarg_T *eap) 8318 { 8319 char_u *spl; 8320 long dummy; 8321 8322 if (no_spell_checking(curwin)) 8323 return; 8324 get_option_value((char_u*)"spl", &dummy, &spl, OPT_LOCAL); 8325 8326 /* Create a new empty buffer in a new window. */ 8327 do_cmdline_cmd((char_u *)"new"); 8328 8329 /* enable spelling locally in the new window */ 8330 set_option_value((char_u*)"spell", TRUE, (char_u*)"", OPT_LOCAL); 8331 set_option_value((char_u*)"spl", dummy, spl, OPT_LOCAL); 8332 vim_free(spl); 8333 8334 if (!BUFEMPTY()) 8335 return; 8336 8337 spell_dump_compl(NULL, 0, NULL, eap->forceit ? DUMPFLAG_COUNT : 0); 8338 8339 /* Delete the empty line that we started with. */ 8340 if (curbuf->b_ml.ml_line_count > 1) 8341 ml_delete(curbuf->b_ml.ml_line_count, FALSE); 8342 8343 redraw_later(NOT_VALID); 8344 } 8345 8346 /* 8347 * Go through all possible words and: 8348 * 1. When "pat" is NULL: dump a list of all words in the current buffer. 8349 * "ic" and "dir" are not used. 8350 * 2. When "pat" is not NULL: add matching words to insert mode completion. 8351 */ 8352 void 8353 spell_dump_compl( 8354 char_u *pat, /* leading part of the word */ 8355 int ic, /* ignore case */ 8356 int *dir, /* direction for adding matches */ 8357 int dumpflags_arg) /* DUMPFLAG_* */ 8358 { 8359 langp_T *lp; 8360 slang_T *slang; 8361 idx_T arridx[MAXWLEN]; 8362 int curi[MAXWLEN]; 8363 char_u word[MAXWLEN]; 8364 int c; 8365 char_u *byts; 8366 idx_T *idxs; 8367 linenr_T lnum = 0; 8368 int round; 8369 int depth; 8370 int n; 8371 int flags; 8372 char_u *region_names = NULL; /* region names being used */ 8373 int do_region = TRUE; /* dump region names and numbers */ 8374 char_u *p; 8375 int lpi; 8376 int dumpflags = dumpflags_arg; 8377 int patlen; 8378 8379 /* When ignoring case or when the pattern starts with capital pass this on 8380 * to dump_word(). */ 8381 if (pat != NULL) 8382 { 8383 if (ic) 8384 dumpflags |= DUMPFLAG_ICASE; 8385 else 8386 { 8387 n = captype(pat, NULL); 8388 if (n == WF_ONECAP) 8389 dumpflags |= DUMPFLAG_ONECAP; 8390 else if (n == WF_ALLCAP && (int)STRLEN(pat) > mb_ptr2len(pat)) 8391 dumpflags |= DUMPFLAG_ALLCAP; 8392 } 8393 } 8394 8395 /* Find out if we can support regions: All languages must support the same 8396 * regions or none at all. */ 8397 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 8398 { 8399 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8400 p = lp->lp_slang->sl_regions; 8401 if (p[0] != 0) 8402 { 8403 if (region_names == NULL) /* first language with regions */ 8404 region_names = p; 8405 else if (STRCMP(region_names, p) != 0) 8406 { 8407 do_region = FALSE; /* region names are different */ 8408 break; 8409 } 8410 } 8411 } 8412 8413 if (do_region && region_names != NULL) 8414 { 8415 if (pat == NULL) 8416 { 8417 vim_snprintf((char *)IObuff, IOSIZE, "/regions=%s", region_names); 8418 ml_append(lnum++, IObuff, (colnr_T)0, FALSE); 8419 } 8420 } 8421 else 8422 do_region = FALSE; 8423 8424 /* 8425 * Loop over all files loaded for the entries in 'spelllang'. 8426 */ 8427 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 8428 { 8429 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8430 slang = lp->lp_slang; 8431 if (slang->sl_fbyts == NULL) /* reloading failed */ 8432 continue; 8433 8434 if (pat == NULL) 8435 { 8436 vim_snprintf((char *)IObuff, IOSIZE, "# file: %s", slang->sl_fname); 8437 ml_append(lnum++, IObuff, (colnr_T)0, FALSE); 8438 } 8439 8440 /* When matching with a pattern and there are no prefixes only use 8441 * parts of the tree that match "pat". */ 8442 if (pat != NULL && slang->sl_pbyts == NULL) 8443 patlen = (int)STRLEN(pat); 8444 else 8445 patlen = -1; 8446 8447 /* round 1: case-folded tree 8448 * round 2: keep-case tree */ 8449 for (round = 1; round <= 2; ++round) 8450 { 8451 if (round == 1) 8452 { 8453 dumpflags &= ~DUMPFLAG_KEEPCASE; 8454 byts = slang->sl_fbyts; 8455 idxs = slang->sl_fidxs; 8456 } 8457 else 8458 { 8459 dumpflags |= DUMPFLAG_KEEPCASE; 8460 byts = slang->sl_kbyts; 8461 idxs = slang->sl_kidxs; 8462 } 8463 if (byts == NULL) 8464 continue; /* array is empty */ 8465 8466 depth = 0; 8467 arridx[0] = 0; 8468 curi[0] = 1; 8469 while (depth >= 0 && !got_int 8470 && (pat == NULL || !ins_compl_interrupted())) 8471 { 8472 if (curi[depth] > byts[arridx[depth]]) 8473 { 8474 /* Done all bytes at this node, go up one level. */ 8475 --depth; 8476 line_breakcheck(); 8477 ins_compl_check_keys(50, FALSE); 8478 } 8479 else 8480 { 8481 /* Do one more byte at this node. */ 8482 n = arridx[depth] + curi[depth]; 8483 ++curi[depth]; 8484 c = byts[n]; 8485 if (c == 0) 8486 { 8487 /* End of word, deal with the word. 8488 * Don't use keep-case words in the fold-case tree, 8489 * they will appear in the keep-case tree. 8490 * Only use the word when the region matches. */ 8491 flags = (int)idxs[n]; 8492 if ((round == 2 || (flags & WF_KEEPCAP) == 0) 8493 && (flags & WF_NEEDCOMP) == 0 8494 && (do_region 8495 || (flags & WF_REGION) == 0 8496 || (((unsigned)flags >> 16) 8497 & lp->lp_region) != 0)) 8498 { 8499 word[depth] = NUL; 8500 if (!do_region) 8501 flags &= ~WF_REGION; 8502 8503 /* Dump the basic word if there is no prefix or 8504 * when it's the first one. */ 8505 c = (unsigned)flags >> 24; 8506 if (c == 0 || curi[depth] == 2) 8507 { 8508 dump_word(slang, word, pat, dir, 8509 dumpflags, flags, lnum); 8510 if (pat == NULL) 8511 ++lnum; 8512 } 8513 8514 /* Apply the prefix, if there is one. */ 8515 if (c != 0) 8516 lnum = dump_prefixes(slang, word, pat, dir, 8517 dumpflags, flags, lnum); 8518 } 8519 } 8520 else 8521 { 8522 /* Normal char, go one level deeper. */ 8523 word[depth++] = c; 8524 arridx[depth] = idxs[n]; 8525 curi[depth] = 1; 8526 8527 /* Check if this characters matches with the pattern. 8528 * If not skip the whole tree below it. 8529 * Always ignore case here, dump_word() will check 8530 * proper case later. This isn't exactly right when 8531 * length changes for multi-byte characters with 8532 * ignore case... */ 8533 if (depth <= patlen 8534 && MB_STRNICMP(word, pat, depth) != 0) 8535 --depth; 8536 } 8537 } 8538 } 8539 } 8540 } 8541 } 8542 8543 /* 8544 * Dump one word: apply case modifications and append a line to the buffer. 8545 * When "lnum" is zero add insert mode completion. 8546 */ 8547 static void 8548 dump_word( 8549 slang_T *slang, 8550 char_u *word, 8551 char_u *pat, 8552 int *dir, 8553 int dumpflags, 8554 int wordflags, 8555 linenr_T lnum) 8556 { 8557 int keepcap = FALSE; 8558 char_u *p; 8559 char_u *tw; 8560 char_u cword[MAXWLEN]; 8561 char_u badword[MAXWLEN + 10]; 8562 int i; 8563 int flags = wordflags; 8564 8565 if (dumpflags & DUMPFLAG_ONECAP) 8566 flags |= WF_ONECAP; 8567 if (dumpflags & DUMPFLAG_ALLCAP) 8568 flags |= WF_ALLCAP; 8569 8570 if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) 8571 { 8572 /* Need to fix case according to "flags". */ 8573 make_case_word(word, cword, flags); 8574 p = cword; 8575 } 8576 else 8577 { 8578 p = word; 8579 if ((dumpflags & DUMPFLAG_KEEPCASE) 8580 && ((captype(word, NULL) & WF_KEEPCAP) == 0 8581 || (flags & WF_FIXCAP) != 0)) 8582 keepcap = TRUE; 8583 } 8584 tw = p; 8585 8586 if (pat == NULL) 8587 { 8588 /* Add flags and regions after a slash. */ 8589 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) 8590 { 8591 STRCPY(badword, p); 8592 STRCAT(badword, "/"); 8593 if (keepcap) 8594 STRCAT(badword, "="); 8595 if (flags & WF_BANNED) 8596 STRCAT(badword, "!"); 8597 else if (flags & WF_RARE) 8598 STRCAT(badword, "?"); 8599 if (flags & WF_REGION) 8600 for (i = 0; i < 7; ++i) 8601 if (flags & (0x10000 << i)) 8602 sprintf((char *)badword + STRLEN(badword), "%d", i + 1); 8603 p = badword; 8604 } 8605 8606 if (dumpflags & DUMPFLAG_COUNT) 8607 { 8608 hashitem_T *hi; 8609 8610 /* Include the word count for ":spelldump!". */ 8611 hi = hash_find(&slang->sl_wordcount, tw); 8612 if (!HASHITEM_EMPTY(hi)) 8613 { 8614 vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d", 8615 tw, HI2WC(hi)->wc_count); 8616 p = IObuff; 8617 } 8618 } 8619 8620 ml_append(lnum, p, (colnr_T)0, FALSE); 8621 } 8622 else if (((dumpflags & DUMPFLAG_ICASE) 8623 ? MB_STRNICMP(p, pat, STRLEN(pat)) == 0 8624 : STRNCMP(p, pat, STRLEN(pat)) == 0) 8625 && ins_compl_add_infercase(p, (int)STRLEN(p), 8626 p_ic, NULL, *dir, FALSE) == OK) 8627 /* if dir was BACKWARD then honor it just once */ 8628 *dir = FORWARD; 8629 } 8630 8631 /* 8632 * For ":spelldump": Find matching prefixes for "word". Prepend each to 8633 * "word" and append a line to the buffer. 8634 * When "lnum" is zero add insert mode completion. 8635 * Return the updated line number. 8636 */ 8637 static linenr_T 8638 dump_prefixes( 8639 slang_T *slang, 8640 char_u *word, /* case-folded word */ 8641 char_u *pat, 8642 int *dir, 8643 int dumpflags, 8644 int flags, /* flags with prefix ID */ 8645 linenr_T startlnum) 8646 { 8647 idx_T arridx[MAXWLEN]; 8648 int curi[MAXWLEN]; 8649 char_u prefix[MAXWLEN]; 8650 char_u word_up[MAXWLEN]; 8651 int has_word_up = FALSE; 8652 int c; 8653 char_u *byts; 8654 idx_T *idxs; 8655 linenr_T lnum = startlnum; 8656 int depth; 8657 int n; 8658 int len; 8659 int i; 8660 8661 /* If the word starts with a lower-case letter make the word with an 8662 * upper-case letter in word_up[]. */ 8663 c = PTR2CHAR(word); 8664 if (SPELL_TOUPPER(c) != c) 8665 { 8666 onecap_copy(word, word_up, TRUE); 8667 has_word_up = TRUE; 8668 } 8669 8670 byts = slang->sl_pbyts; 8671 idxs = slang->sl_pidxs; 8672 if (byts != NULL) /* array not is empty */ 8673 { 8674 /* 8675 * Loop over all prefixes, building them byte-by-byte in prefix[]. 8676 * When at the end of a prefix check that it supports "flags". 8677 */ 8678 depth = 0; 8679 arridx[0] = 0; 8680 curi[0] = 1; 8681 while (depth >= 0 && !got_int) 8682 { 8683 n = arridx[depth]; 8684 len = byts[n]; 8685 if (curi[depth] > len) 8686 { 8687 /* Done all bytes at this node, go up one level. */ 8688 --depth; 8689 line_breakcheck(); 8690 } 8691 else 8692 { 8693 /* Do one more byte at this node. */ 8694 n += curi[depth]; 8695 ++curi[depth]; 8696 c = byts[n]; 8697 if (c == 0) 8698 { 8699 /* End of prefix, find out how many IDs there are. */ 8700 for (i = 1; i < len; ++i) 8701 if (byts[n + i] != 0) 8702 break; 8703 curi[depth] += i - 1; 8704 8705 c = valid_word_prefix(i, n, flags, word, slang, FALSE); 8706 if (c != 0) 8707 { 8708 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1); 8709 dump_word(slang, prefix, pat, dir, dumpflags, 8710 (c & WF_RAREPFX) ? (flags | WF_RARE) 8711 : flags, lnum); 8712 if (lnum != 0) 8713 ++lnum; 8714 } 8715 8716 /* Check for prefix that matches the word when the 8717 * first letter is upper-case, but only if the prefix has 8718 * a condition. */ 8719 if (has_word_up) 8720 { 8721 c = valid_word_prefix(i, n, flags, word_up, slang, 8722 TRUE); 8723 if (c != 0) 8724 { 8725 vim_strncpy(prefix + depth, word_up, 8726 MAXWLEN - depth - 1); 8727 dump_word(slang, prefix, pat, dir, dumpflags, 8728 (c & WF_RAREPFX) ? (flags | WF_RARE) 8729 : flags, lnum); 8730 if (lnum != 0) 8731 ++lnum; 8732 } 8733 } 8734 } 8735 else 8736 { 8737 /* Normal char, go one level deeper. */ 8738 prefix[depth++] = c; 8739 arridx[depth] = idxs[n]; 8740 curi[depth] = 1; 8741 } 8742 } 8743 } 8744 } 8745 8746 return lnum; 8747 } 8748 8749 /* 8750 * Move "p" to the end of word "start". 8751 * Uses the spell-checking word characters. 8752 */ 8753 char_u * 8754 spell_to_word_end(char_u *start, win_T *win) 8755 { 8756 char_u *p = start; 8757 8758 while (*p != NUL && spell_iswordp(p, win)) 8759 MB_PTR_ADV(p); 8760 return p; 8761 } 8762 8763 #if defined(FEAT_INS_EXPAND) || defined(PROTO) 8764 /* 8765 * For Insert mode completion CTRL-X s: 8766 * Find start of the word in front of column "startcol". 8767 * We don't check if it is badly spelled, with completion we can only change 8768 * the word in front of the cursor. 8769 * Returns the column number of the word. 8770 */ 8771 int 8772 spell_word_start(int startcol) 8773 { 8774 char_u *line; 8775 char_u *p; 8776 int col = 0; 8777 8778 if (no_spell_checking(curwin)) 8779 return startcol; 8780 8781 /* Find a word character before "startcol". */ 8782 line = ml_get_curline(); 8783 for (p = line + startcol; p > line; ) 8784 { 8785 MB_PTR_BACK(line, p); 8786 if (spell_iswordp_nmw(p, curwin)) 8787 break; 8788 } 8789 8790 /* Go back to start of the word. */ 8791 while (p > line) 8792 { 8793 col = (int)(p - line); 8794 MB_PTR_BACK(line, p); 8795 if (!spell_iswordp(p, curwin)) 8796 break; 8797 col = 0; 8798 } 8799 8800 return col; 8801 } 8802 8803 /* 8804 * Need to check for 'spellcapcheck' now, the word is removed before 8805 * expand_spelling() is called. Therefore the ugly global variable. 8806 */ 8807 static int spell_expand_need_cap; 8808 8809 void 8810 spell_expand_check_cap(colnr_T col) 8811 { 8812 spell_expand_need_cap = check_need_cap(curwin->w_cursor.lnum, col); 8813 } 8814 8815 /* 8816 * Get list of spelling suggestions. 8817 * Used for Insert mode completion CTRL-X ?. 8818 * Returns the number of matches. The matches are in "matchp[]", array of 8819 * allocated strings. 8820 */ 8821 int 8822 expand_spelling( 8823 linenr_T lnum UNUSED, 8824 char_u *pat, 8825 char_u ***matchp) 8826 { 8827 garray_T ga; 8828 8829 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, TRUE); 8830 *matchp = ga.ga_data; 8831 return ga.ga_len; 8832 } 8833 #endif 8834 8835 #endif /* FEAT_SPELL */ 8836