1 /* vi:set ts=8 sts=4 sw=4 noet: 2 * 3 * VIM - Vi IMproved by Bram Moolenaar 4 * 5 * Do ":help uganda" in Vim to read copying and usage conditions. 6 * Do ":help credits" in Vim to see a list of people who contributed. 7 * See README.txt for an overview of the Vim source code. 8 */ 9 10 /* 11 * spell.c: code for spell checking 12 * 13 * See spellfile.c for the Vim spell file format. 14 * 15 * The spell checking mechanism uses a tree (aka trie). Each node in the tree 16 * has a list of bytes that can appear (siblings). For each byte there is a 17 * pointer to the node with the byte that follows in the word (child). 18 * 19 * A NUL byte is used where the word may end. The bytes are sorted, so that 20 * binary searching can be used and the NUL bytes are at the start. The 21 * number of possible bytes is stored before the list of bytes. 22 * 23 * The tree uses two arrays: "byts" stores the characters, "idxs" stores 24 * either the next index or flags. The tree starts at index 0. For example, 25 * to lookup "vi" this sequence is followed: 26 * i = 0 27 * len = byts[i] 28 * n = where "v" appears in byts[i + 1] to byts[i + len] 29 * i = idxs[n] 30 * len = byts[i] 31 * n = where "i" appears in byts[i + 1] to byts[i + len] 32 * i = idxs[n] 33 * len = byts[i] 34 * find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi". 35 * 36 * There are two word trees: one with case-folded words and one with words in 37 * original case. The second one is only used for keep-case words and is 38 * usually small. 39 * 40 * There is one additional tree for when not all prefixes are applied when 41 * generating the .spl file. This tree stores all the possible prefixes, as 42 * if they were words. At each word (prefix) end the prefix nr is stored, the 43 * following word must support this prefix nr. And the condition nr is 44 * stored, used to lookup the condition that the word must match with. 
45 * 46 * Thanks to Olaf Seibert for providing an example implementation of this tree 47 * and the compression mechanism. 48 * LZ trie ideas: 49 * http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf 50 * More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html 51 * 52 * Matching involves checking the caps type: Onecap ALLCAP KeepCap. 53 * 54 * Why doesn't Vim use aspell/ispell/myspell/etc.? 55 * See ":help develop-spell". 56 */ 57 58 /* 59 * Use this to adjust the score after finding suggestions, based on the 60 * suggested word sounding like the bad word. This is much faster than doing 61 * it for every possible suggestion. 62 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" 63 * vs "ht") and goes down in the list. 64 * Used when 'spellsuggest' is set to "best". 65 */ 66 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) 67 68 /* 69 * Do the opposite: based on a maximum end score and a known sound score, 70 * compute the maximum word score that can be used. 71 */ 72 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3) 73 74 #define IN_SPELL_C 75 #include "vim.h" 76 77 #if defined(FEAT_SPELL) || defined(PROTO) 78 79 #ifndef UNIX /* it's in os_unix.h for Unix */ 80 # include <time.h> /* for time_t */ 81 #endif 82 83 /* only used for su_badflags */ 84 #define WF_MIXCAP 0x20 /* mix of upper and lower case: macaRONI */ 85 86 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP | WF_FIXCAP) 87 88 #define REGION_ALL 0xff /* word valid in all regions */ 89 90 #define VIMSUGMAGIC "VIMsug" /* string at start of Vim .sug file */ 91 #define VIMSUGMAGICL 6 92 #define VIMSUGVERSION 1 93 94 /* Result values. Lower number is accepted over higher one. 
*/ 95 #define SP_BANNED -1 96 #define SP_OK 0 97 #define SP_RARE 1 98 #define SP_LOCAL 2 99 #define SP_BAD 3 100 101 typedef struct wordcount_S 102 { 103 short_u wc_count; /* nr of times word was seen */ 104 char_u wc_word[1]; /* word, actually longer */ 105 } wordcount_T; 106 107 #define WC_KEY_OFF offsetof(wordcount_T, wc_word) 108 #define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF)) 109 #define MAXWORDCOUNT 0xffff 110 111 /* 112 * Information used when looking for suggestions. 113 */ 114 typedef struct suginfo_S 115 { 116 garray_T su_ga; /* suggestions, contains "suggest_T" */ 117 int su_maxcount; /* max. number of suggestions displayed */ 118 int su_maxscore; /* maximum score for adding to su_ga */ 119 int su_sfmaxscore; /* idem, for when doing soundfold words */ 120 garray_T su_sga; /* like su_ga, sound-folded scoring */ 121 char_u *su_badptr; /* start of bad word in line */ 122 int su_badlen; /* length of detected bad word in line */ 123 int su_badflags; /* caps flags for bad word */ 124 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ 125 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ 126 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */ 127 hashtab_T su_banned; /* table with banned words */ 128 slang_T *su_sallang; /* default language for sound folding */ 129 } suginfo_T; 130 131 /* One word suggestion. Used in "si_ga". 
*/ 132 typedef struct suggest_S 133 { 134 char_u *st_word; /* suggested word, allocated string */ 135 int st_wordlen; /* STRLEN(st_word) */ 136 int st_orglen; /* length of replaced text */ 137 int st_score; /* lower is better */ 138 int st_altscore; /* used when st_score compares equal */ 139 int st_salscore; /* st_score is for soundalike */ 140 int st_had_bonus; /* bonus already included in score */ 141 slang_T *st_slang; /* language used for sound folding */ 142 } suggest_T; 143 144 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) 145 146 /* TRUE if a word appears in the list of banned words. */ 147 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word))) 148 149 /* Number of suggestions kept when cleaning up. We need to keep more than 150 * what is displayed, because when rescore_suggestions() is called the score 151 * may change and wrong suggestions may be removed later. */ 152 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20) 153 154 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots 155 * of suggestions that are not going to be displayed. 
*/ 156 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) 157 158 /* score for various changes */ 159 #define SCORE_SPLIT 149 /* split bad word */ 160 #define SCORE_SPLIT_NO 249 /* split bad word with NOSPLITSUGS */ 161 #define SCORE_ICASE 52 /* slightly different case */ 162 #define SCORE_REGION 200 /* word is for different region */ 163 #define SCORE_RARE 180 /* rare word */ 164 #define SCORE_SWAP 75 /* swap two characters */ 165 #define SCORE_SWAP3 110 /* swap two characters in three */ 166 #define SCORE_REP 65 /* REP replacement */ 167 #define SCORE_SUBST 93 /* substitute a character */ 168 #define SCORE_SIMILAR 33 /* substitute a similar character */ 169 #define SCORE_SUBCOMP 33 /* substitute a composing character */ 170 #define SCORE_DEL 94 /* delete a character */ 171 #define SCORE_DELDUP 66 /* delete a duplicated character */ 172 #define SCORE_DELCOMP 28 /* delete a composing character */ 173 #define SCORE_INS 96 /* insert a character */ 174 #define SCORE_INSDUP 67 /* insert a duplicate character */ 175 #define SCORE_INSCOMP 30 /* insert a composing character */ 176 #define SCORE_NONWORD 103 /* change non-word to word char */ 177 178 #define SCORE_FILE 30 /* suggestion from a file */ 179 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. 180 * 350 allows for about three changes. */ 181 182 #define SCORE_COMMON1 30 /* subtracted for words seen before */ 183 #define SCORE_COMMON2 40 /* subtracted for words often seen */ 184 #define SCORE_COMMON3 50 /* subtracted for words very often seen */ 185 #define SCORE_THRES2 10 /* word count threshold for COMMON2 */ 186 #define SCORE_THRES3 100 /* word count threshold for COMMON3 */ 187 188 /* When trying changed soundfold words it becomes slow when trying more than 189 * two changes. With less then two changes it's slightly faster but we miss a 190 * few good suggestions. In rare cases we need to try three of four changes. 
191 */ 192 #define SCORE_SFMAX1 200 /* maximum score for first try */ 193 #define SCORE_SFMAX2 300 /* maximum score for second try */ 194 #define SCORE_SFMAX3 400 /* maximum score for third try */ 195 196 #define SCORE_BIG SCORE_INS * 3 /* big difference */ 197 #define SCORE_MAXMAX 999999 /* accept any score */ 198 #define SCORE_LIMITMAX 350 /* for spell_edit_score_limit() */ 199 200 /* for spell_edit_score_limit() we need to know the minimum value of 201 * SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS */ 202 #define SCORE_EDIT_MIN SCORE_SIMILAR 203 204 /* 205 * Structure to store info for word matching. 206 */ 207 typedef struct matchinf_S 208 { 209 langp_T *mi_lp; /* info for language and region */ 210 211 /* pointers to original text to be checked */ 212 char_u *mi_word; /* start of word being checked */ 213 char_u *mi_end; /* end of matching word so far */ 214 char_u *mi_fend; /* next char to be added to mi_fword */ 215 char_u *mi_cend; /* char after what was used for 216 mi_capflags */ 217 218 /* case-folded text */ 219 char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */ 220 int mi_fwordlen; /* nr of valid bytes in mi_fword */ 221 222 /* for when checking word after a prefix */ 223 int mi_prefarridx; /* index in sl_pidxs with list of 224 affixID/condition */ 225 int mi_prefcnt; /* number of entries at mi_prefarridx */ 226 int mi_prefixlen; /* byte length of prefix */ 227 int mi_cprefixlen; /* byte length of prefix in original 228 case */ 229 230 /* for when checking a compound word */ 231 int mi_compoff; /* start of following word offset */ 232 char_u mi_compflags[MAXWLEN]; /* flags for compound words used */ 233 int mi_complen; /* nr of compound words used */ 234 int mi_compextra; /* nr of COMPOUNDROOT words */ 235 236 /* others */ 237 int mi_result; /* result so far: SP_BAD, SP_OK, etc. 
*/ 238 int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */ 239 win_T *mi_win; /* buffer being checked */ 240 241 /* for NOBREAK */ 242 int mi_result2; /* "mi_resul" without following word */ 243 char_u *mi_end2; /* "mi_end" without following word */ 244 } matchinf_T; 245 246 247 static int spell_iswordp(char_u *p, win_T *wp); 248 static int spell_mb_isword_class(int cl, win_T *wp); 249 250 /* 251 * For finding suggestions: At each node in the tree these states are tried: 252 */ 253 typedef enum 254 { 255 STATE_START = 0, /* At start of node check for NUL bytes (goodword 256 * ends); if badword ends there is a match, otherwise 257 * try splitting word. */ 258 STATE_NOPREFIX, /* try without prefix */ 259 STATE_SPLITUNDO, /* Undo splitting. */ 260 STATE_ENDNUL, /* Past NUL bytes at start of the node. */ 261 STATE_PLAIN, /* Use each byte of the node. */ 262 STATE_DEL, /* Delete a byte from the bad word. */ 263 STATE_INS_PREP, /* Prepare for inserting bytes. */ 264 STATE_INS, /* Insert a byte in the bad word. */ 265 STATE_SWAP, /* Swap two bytes. */ 266 STATE_UNSWAP, /* Undo swap two characters. */ 267 STATE_SWAP3, /* Swap two characters over three. */ 268 STATE_UNSWAP3, /* Undo Swap two characters over three. */ 269 STATE_UNROT3L, /* Undo rotate three characters left */ 270 STATE_UNROT3R, /* Undo rotate three characters right */ 271 STATE_REP_INI, /* Prepare for using REP items. */ 272 STATE_REP, /* Use matching REP items from the .aff file. */ 273 STATE_REP_UNDO, /* Undo a REP item replacement. */ 274 STATE_FINAL /* End of this node. */ 275 } state_T; 276 277 /* 278 * Struct to keep the state at each level in suggest_try_change(). 
279 */ 280 typedef struct trystate_S 281 { 282 state_T ts_state; /* state at this level, STATE_ */ 283 int ts_score; /* score */ 284 idx_T ts_arridx; /* index in tree array, start of node */ 285 short ts_curi; /* index in list of child nodes */ 286 char_u ts_fidx; /* index in fword[], case-folded bad word */ 287 char_u ts_fidxtry; /* ts_fidx at which bytes may be changed */ 288 char_u ts_twordlen; /* valid length of tword[] */ 289 char_u ts_prefixdepth; /* stack depth for end of prefix or 290 * PFD_PREFIXTREE or PFD_NOPREFIX */ 291 char_u ts_flags; /* TSF_ flags */ 292 char_u ts_tcharlen; /* number of bytes in tword character */ 293 char_u ts_tcharidx; /* current byte index in tword character */ 294 char_u ts_isdiff; /* DIFF_ values */ 295 char_u ts_fcharstart; /* index in fword where badword char started */ 296 char_u ts_prewordlen; /* length of word in "preword[]" */ 297 char_u ts_splitoff; /* index in "tword" after last split */ 298 char_u ts_splitfidx; /* "ts_fidx" at word split */ 299 char_u ts_complen; /* nr of compound words used */ 300 char_u ts_compsplit; /* index for "compflags" where word was spit */ 301 char_u ts_save_badflags; /* su_badflags saved here */ 302 char_u ts_delidx; /* index in fword for char that was deleted, 303 valid when "ts_flags" has TSF_DIDDEL */ 304 } trystate_T; 305 306 /* values for ts_isdiff */ 307 #define DIFF_NONE 0 /* no different byte (yet) */ 308 #define DIFF_YES 1 /* different byte found */ 309 #define DIFF_INSERT 2 /* inserting character */ 310 311 /* values for ts_flags */ 312 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */ 313 #define TSF_DIDSPLIT 2 /* tried split at this point */ 314 #define TSF_DIDDEL 4 /* did a delete, "ts_delidx" has index */ 315 316 /* special values ts_prefixdepth */ 317 #define PFD_NOPREFIX 0xff /* not using prefixes */ 318 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */ 319 #define PFD_NOTSPECIAL 0xfd /* highest value that's not special */ 320 321 /* mode values 
for find_word */ 322 #define FIND_FOLDWORD 0 /* find word case-folded */ 323 #define FIND_KEEPWORD 1 /* find keep-case word */ 324 #define FIND_PREFIX 2 /* find word after prefix */ 325 #define FIND_COMPOUND 3 /* find case-folded compound word */ 326 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ 327 328 static void find_word(matchinf_T *mip, int mode); 329 static int match_checkcompoundpattern(char_u *ptr, int wlen, garray_T *gap); 330 static int can_compound(slang_T *slang, char_u *word, char_u *flags); 331 static int match_compoundrule(slang_T *slang, char_u *compflags); 332 static int valid_word_prefix(int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req); 333 static void find_prefix(matchinf_T *mip, int mode); 334 static int fold_more(matchinf_T *mip); 335 static int spell_valid_case(int wordflags, int treeflags); 336 static void spell_load_cb(char_u *fname, void *cookie); 337 static int count_syllables(slang_T *slang, char_u *word); 338 static void clear_midword(win_T *buf); 339 static void use_midword(slang_T *lp, win_T *buf); 340 static int find_region(char_u *rp, char_u *region); 341 static int check_need_cap(linenr_T lnum, colnr_T col); 342 static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive); 343 #ifdef FEAT_EVAL 344 static void spell_suggest_expr(suginfo_T *su, char_u *expr); 345 #endif 346 static void spell_suggest_file(suginfo_T *su, char_u *fname); 347 static void spell_suggest_intern(suginfo_T *su, int interactive); 348 static void spell_find_cleanup(suginfo_T *su); 349 static void suggest_try_special(suginfo_T *su); 350 static void suggest_try_change(suginfo_T *su); 351 static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold); 352 static void go_deeper(trystate_T *stack, int depth, int score_add); 353 static int nofold_len(char_u *fword, int flen, char_u *word); 354 static void 
find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword); 355 static void score_comp_sal(suginfo_T *su); 356 static void score_combine(suginfo_T *su); 357 static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound); 358 static void suggest_try_soundalike_prep(void); 359 static void suggest_try_soundalike(suginfo_T *su); 360 static void suggest_try_soundalike_finish(void); 361 static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp); 362 static int soundfold_find(slang_T *slang, char_u *word); 363 static void make_case_word(char_u *fword, char_u *cword, int flags); 364 static int similar_chars(slang_T *slang, int c1, int c2); 365 static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf); 366 static void check_suggestions(suginfo_T *su, garray_T *gap); 367 static void add_banned(suginfo_T *su, char_u *word); 368 static void rescore_suggestions(suginfo_T *su); 369 static void rescore_one(suginfo_T *su, suggest_T *stp); 370 static int cleanup_suggestions(garray_T *gap, int maxscore, int keep); 371 static void spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res); 372 static void spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res); 373 static void spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res); 374 static int soundalike_score(char_u *goodsound, char_u *badsound); 375 static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword); 376 static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit); 377 static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit); 378 static void dump_word(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T lnum); 379 static linenr_T dump_prefixes(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, 
linenr_T startlnum); 380 381 382 /* Remember what "z?" replaced. */ 383 static char_u *repl_from = NULL; 384 static char_u *repl_to = NULL; 385 386 /* 387 * Main spell-checking function. 388 * "ptr" points to a character that could be the start of a word. 389 * "*attrp" is set to the highlight index for a badly spelled word. For a 390 * non-word or when it's OK it remains unchanged. 391 * This must only be called when 'spelllang' is not empty. 392 * 393 * "capcol" is used to check for a Capitalised word after the end of a 394 * sentence. If it's zero then perform the check. Return the column where to 395 * check next, or -1 when no sentence end was found. If it's NULL then don't 396 * worry. 397 * 398 * Returns the length of the word in bytes, also when it's OK, so that the 399 * caller can skip over the word. 400 */ 401 int 402 spell_check( 403 win_T *wp, /* current window */ 404 char_u *ptr, 405 hlf_T *attrp, 406 int *capcol, /* column to check for Capital */ 407 int docount) /* count good words */ 408 { 409 matchinf_T mi; /* Most things are put in "mi" so that it can 410 be passed to functions quickly. */ 411 int nrlen = 0; /* found a number first */ 412 int c; 413 int wrongcaplen = 0; 414 int lpi; 415 int count_word = docount; 416 417 /* A word never starts at a space or a control character. Return quickly 418 * then, skipping over the character. */ 419 if (*ptr <= ' ') 420 return 1; 421 422 /* Return here when loading language files failed. */ 423 if (wp->w_s->b_langp.ga_len == 0) 424 return 1; 425 426 vim_memset(&mi, 0, sizeof(matchinf_T)); 427 428 /* A number is always OK. Also skip hexadecimal numbers 0xFF99 and 429 * 0X99FF. But always do check spelling to find "3GPP" and "11 430 * julifeest". 
*/ 431 if (*ptr >= '0' && *ptr <= '9') 432 { 433 if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B')) 434 mi.mi_end = skipbin(ptr + 2); 435 else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) 436 mi.mi_end = skiphex(ptr + 2); 437 else 438 mi.mi_end = skipdigits(ptr); 439 nrlen = (int)(mi.mi_end - ptr); 440 } 441 442 /* Find the normal end of the word (until the next non-word character). */ 443 mi.mi_word = ptr; 444 mi.mi_fend = ptr; 445 if (spell_iswordp(mi.mi_fend, wp)) 446 { 447 do 448 MB_PTR_ADV(mi.mi_fend); 449 while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp)); 450 451 if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL) 452 { 453 /* Check word starting with capital letter. */ 454 c = PTR2CHAR(ptr); 455 if (!SPELL_ISUPPER(c)) 456 wrongcaplen = (int)(mi.mi_fend - ptr); 457 } 458 } 459 if (capcol != NULL) 460 *capcol = -1; 461 462 /* We always use the characters up to the next non-word character, 463 * also for bad words. */ 464 mi.mi_end = mi.mi_fend; 465 466 /* Check caps type later. */ 467 mi.mi_capflags = 0; 468 mi.mi_cend = NULL; 469 mi.mi_win = wp; 470 471 /* case-fold the word with one non-word character, so that we can check 472 * for the word end. */ 473 if (*mi.mi_fend != NUL) 474 MB_PTR_ADV(mi.mi_fend); 475 476 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, 477 MAXWLEN + 1); 478 mi.mi_fwordlen = (int)STRLEN(mi.mi_fword); 479 480 /* The word is bad unless we recognize it. */ 481 mi.mi_result = SP_BAD; 482 mi.mi_result2 = SP_BAD; 483 484 /* 485 * Loop over the languages specified in 'spelllang'. 486 * We check them all, because a word may be matched longer in another 487 * language. 488 */ 489 for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi) 490 { 491 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi); 492 493 /* If reloading fails the language is still in the list but everything 494 * has been cleared. 
*/ 495 if (mi.mi_lp->lp_slang->sl_fidxs == NULL) 496 continue; 497 498 /* Check for a matching word in case-folded words. */ 499 find_word(&mi, FIND_FOLDWORD); 500 501 /* Check for a matching word in keep-case words. */ 502 find_word(&mi, FIND_KEEPWORD); 503 504 /* Check for matching prefixes. */ 505 find_prefix(&mi, FIND_FOLDWORD); 506 507 /* For a NOBREAK language, may want to use a word without a following 508 * word as a backup. */ 509 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD 510 && mi.mi_result2 != SP_BAD) 511 { 512 mi.mi_result = mi.mi_result2; 513 mi.mi_end = mi.mi_end2; 514 } 515 516 /* Count the word in the first language where it's found to be OK. */ 517 if (count_word && mi.mi_result == SP_OK) 518 { 519 count_common_word(mi.mi_lp->lp_slang, ptr, 520 (int)(mi.mi_end - ptr), 1); 521 count_word = FALSE; 522 } 523 } 524 525 if (mi.mi_result != SP_OK) 526 { 527 /* If we found a number skip over it. Allows for "42nd". Do flag 528 * rare and local words, e.g., "3GPP". */ 529 if (nrlen > 0) 530 { 531 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) 532 return nrlen; 533 } 534 535 /* When we are at a non-word character there is no error, just 536 * skip over the character (try looking for a word after it). */ 537 else if (!spell_iswordp_nmw(ptr, wp)) 538 { 539 if (capcol != NULL && wp->w_s->b_cap_prog != NULL) 540 { 541 regmatch_T regmatch; 542 int r; 543 544 /* Check for end of sentence. */ 545 regmatch.regprog = wp->w_s->b_cap_prog; 546 regmatch.rm_ic = FALSE; 547 r = vim_regexec(®match, ptr, 0); 548 wp->w_s->b_cap_prog = regmatch.regprog; 549 if (r) 550 *capcol = (int)(regmatch.endp[0] - ptr); 551 } 552 553 if (has_mbyte) 554 return (*mb_ptr2len)(ptr); 555 return 1; 556 } 557 else if (mi.mi_end == ptr) 558 /* Always include at least one character. Required for when there 559 * is a mixup in "midword". 
*/ 560 MB_PTR_ADV(mi.mi_end); 561 else if (mi.mi_result == SP_BAD 562 && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak) 563 { 564 char_u *p, *fp; 565 int save_result = mi.mi_result; 566 567 /* First language in 'spelllang' is NOBREAK. Find first position 568 * at which any word would be valid. */ 569 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0); 570 if (mi.mi_lp->lp_slang->sl_fidxs != NULL) 571 { 572 p = mi.mi_word; 573 fp = mi.mi_fword; 574 for (;;) 575 { 576 MB_PTR_ADV(p); 577 MB_PTR_ADV(fp); 578 if (p >= mi.mi_end) 579 break; 580 mi.mi_compoff = (int)(fp - mi.mi_fword); 581 find_word(&mi, FIND_COMPOUND); 582 if (mi.mi_result != SP_BAD) 583 { 584 mi.mi_end = p; 585 break; 586 } 587 } 588 mi.mi_result = save_result; 589 } 590 } 591 592 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) 593 *attrp = HLF_SPB; 594 else if (mi.mi_result == SP_RARE) 595 *attrp = HLF_SPR; 596 else 597 *attrp = HLF_SPL; 598 } 599 600 if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE)) 601 { 602 /* Report SpellCap only when the word isn't badly spelled. */ 603 *attrp = HLF_SPC; 604 return wrongcaplen; 605 } 606 607 return (int)(mi.mi_end - ptr); 608 } 609 610 /* 611 * Check if the word at "mip->mi_word" is in the tree. 612 * When "mode" is FIND_FOLDWORD check in fold-case word tree. 613 * When "mode" is FIND_KEEPWORD check in keep-case word tree. 614 * When "mode" is FIND_PREFIX check for word after prefix in fold-case word 615 * tree. 616 * 617 * For a match mip->mi_result is updated. 
618 */ 619 static void 620 find_word(matchinf_T *mip, int mode) 621 { 622 idx_T arridx = 0; 623 int endlen[MAXWLEN]; /* length at possible word endings */ 624 idx_T endidx[MAXWLEN]; /* possible word endings */ 625 int endidxcnt = 0; 626 int len; 627 int wlen = 0; 628 int flen; 629 int c; 630 char_u *ptr; 631 idx_T lo, hi, m; 632 char_u *s; 633 char_u *p; 634 int res = SP_BAD; 635 slang_T *slang = mip->mi_lp->lp_slang; 636 unsigned flags; 637 char_u *byts; 638 idx_T *idxs; 639 int word_ends; 640 int prefix_found; 641 int nobreak_result; 642 643 if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) 644 { 645 /* Check for word with matching case in keep-case tree. */ 646 ptr = mip->mi_word; 647 flen = 9999; /* no case folding, always enough bytes */ 648 byts = slang->sl_kbyts; 649 idxs = slang->sl_kidxs; 650 651 if (mode == FIND_KEEPCOMPOUND) 652 /* Skip over the previously found word(s). */ 653 wlen += mip->mi_compoff; 654 } 655 else 656 { 657 /* Check for case-folded in case-folded tree. */ 658 ptr = mip->mi_fword; 659 flen = mip->mi_fwordlen; /* available case-folded bytes */ 660 byts = slang->sl_fbyts; 661 idxs = slang->sl_fidxs; 662 663 if (mode == FIND_PREFIX) 664 { 665 /* Skip over the prefix. */ 666 wlen = mip->mi_prefixlen; 667 flen -= mip->mi_prefixlen; 668 } 669 else if (mode == FIND_COMPOUND) 670 { 671 /* Skip over the previously found word(s). */ 672 wlen = mip->mi_compoff; 673 flen -= mip->mi_compoff; 674 } 675 676 } 677 678 if (byts == NULL) 679 return; /* array is empty */ 680 681 /* 682 * Repeat advancing in the tree until: 683 * - there is a byte that doesn't match, 684 * - we reach the end of the tree, 685 * - or we reach the end of the line. 686 */ 687 for (;;) 688 { 689 if (flen <= 0 && *mip->mi_fend != NUL) 690 flen = fold_more(mip); 691 692 len = byts[arridx++]; 693 694 /* If the first possible byte is a zero the word could end here. 695 * Remember this index, we first check for the longest word. 
*/ 696 if (byts[arridx] == 0) 697 { 698 if (endidxcnt == MAXWLEN) 699 { 700 /* Must be a corrupted spell file. */ 701 emsg(_(e_format)); 702 return; 703 } 704 endlen[endidxcnt] = wlen; 705 endidx[endidxcnt++] = arridx++; 706 --len; 707 708 /* Skip over the zeros, there can be several flag/region 709 * combinations. */ 710 while (len > 0 && byts[arridx] == 0) 711 { 712 ++arridx; 713 --len; 714 } 715 if (len == 0) 716 break; /* no children, word must end here */ 717 } 718 719 /* Stop looking at end of the line. */ 720 if (ptr[wlen] == NUL) 721 break; 722 723 /* Perform a binary search in the list of accepted bytes. */ 724 c = ptr[wlen]; 725 if (c == TAB) /* <Tab> is handled like <Space> */ 726 c = ' '; 727 lo = arridx; 728 hi = arridx + len - 1; 729 while (lo < hi) 730 { 731 m = (lo + hi) / 2; 732 if (byts[m] > c) 733 hi = m - 1; 734 else if (byts[m] < c) 735 lo = m + 1; 736 else 737 { 738 lo = hi = m; 739 break; 740 } 741 } 742 743 /* Stop if there is no matching byte. */ 744 if (hi < lo || byts[lo] != c) 745 break; 746 747 /* Continue at the child (if there is one). */ 748 arridx = idxs[lo]; 749 ++wlen; 750 --flen; 751 752 /* One space in the good word may stand for several spaces in the 753 * checked word. */ 754 if (c == ' ') 755 { 756 for (;;) 757 { 758 if (flen <= 0 && *mip->mi_fend != NUL) 759 flen = fold_more(mip); 760 if (ptr[wlen] != ' ' && ptr[wlen] != TAB) 761 break; 762 ++wlen; 763 --flen; 764 } 765 } 766 } 767 768 /* 769 * Verify that one of the possible endings is valid. Try the longest 770 * first. 
771 */ 772 while (endidxcnt > 0) 773 { 774 --endidxcnt; 775 arridx = endidx[endidxcnt]; 776 wlen = endlen[endidxcnt]; 777 778 if ((*mb_head_off)(ptr, ptr + wlen) > 0) 779 continue; /* not at first byte of character */ 780 if (spell_iswordp(ptr + wlen, mip->mi_win)) 781 { 782 if (slang->sl_compprog == NULL && !slang->sl_nobreak) 783 continue; /* next char is a word character */ 784 word_ends = FALSE; 785 } 786 else 787 word_ends = TRUE; 788 /* The prefix flag is before compound flags. Once a valid prefix flag 789 * has been found we try compound flags. */ 790 prefix_found = FALSE; 791 792 if (mode != FIND_KEEPWORD && has_mbyte) 793 { 794 /* Compute byte length in original word, length may change 795 * when folding case. This can be slow, take a shortcut when the 796 * case-folded word is equal to the keep-case word. */ 797 p = mip->mi_word; 798 if (STRNCMP(ptr, p, wlen) != 0) 799 { 800 for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) 801 MB_PTR_ADV(p); 802 wlen = (int)(p - mip->mi_word); 803 } 804 } 805 806 /* Check flags and region. For FIND_PREFIX check the condition and 807 * prefix ID. 808 * Repeat this if there are more flags/region alternatives until there 809 * is a match. */ 810 res = SP_BAD; 811 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; 812 --len, ++arridx) 813 { 814 flags = idxs[arridx]; 815 816 /* For the fold-case tree check that the case of the checked word 817 * matches with what the word in the tree requires. 818 * For keep-case tree the case is always right. For prefixes we 819 * don't bother to check. */ 820 if (mode == FIND_FOLDWORD) 821 { 822 if (mip->mi_cend != mip->mi_word + wlen) 823 { 824 /* mi_capflags was set for a different word length, need 825 * to do it again. 
*/ 826 mip->mi_cend = mip->mi_word + wlen; 827 mip->mi_capflags = captype(mip->mi_word, mip->mi_cend); 828 } 829 830 if (mip->mi_capflags == WF_KEEPCAP 831 || !spell_valid_case(mip->mi_capflags, flags)) 832 continue; 833 } 834 835 /* When mode is FIND_PREFIX the word must support the prefix: 836 * check the prefix ID and the condition. Do that for the list at 837 * mip->mi_prefarridx that find_prefix() filled. */ 838 else if (mode == FIND_PREFIX && !prefix_found) 839 { 840 c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx, 841 flags, 842 mip->mi_word + mip->mi_cprefixlen, slang, 843 FALSE); 844 if (c == 0) 845 continue; 846 847 /* Use the WF_RARE flag for a rare prefix. */ 848 if (c & WF_RAREPFX) 849 flags |= WF_RARE; 850 prefix_found = TRUE; 851 } 852 853 if (slang->sl_nobreak) 854 { 855 if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND) 856 && (flags & WF_BANNED) == 0) 857 { 858 /* NOBREAK: found a valid following word. That's all we 859 * need to know, so return. */ 860 mip->mi_result = SP_OK; 861 break; 862 } 863 } 864 865 else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND 866 || !word_ends)) 867 { 868 /* If there is no compound flag or the word is shorter than 869 * COMPOUNDMIN reject it quickly. 870 * Makes you wonder why someone puts a compound flag on a word 871 * that's too short... Myspell compatibility requires this 872 * anyway. */ 873 if (((unsigned)flags >> 24) == 0 874 || wlen - mip->mi_compoff < slang->sl_compminlen) 875 continue; 876 /* For multi-byte chars check character length against 877 * COMPOUNDMIN. */ 878 if (has_mbyte 879 && slang->sl_compminlen > 0 880 && mb_charlen_len(mip->mi_word + mip->mi_compoff, 881 wlen - mip->mi_compoff) < slang->sl_compminlen) 882 continue; 883 884 /* Limit the number of compound words to COMPOUNDWORDMAX if no 885 * maximum for syllables is specified. 
*/ 886 if (!word_ends && mip->mi_complen + mip->mi_compextra + 2 887 > slang->sl_compmax 888 && slang->sl_compsylmax == MAXWLEN) 889 continue; 890 891 /* Don't allow compounding on a side where an affix was added, 892 * unless COMPOUNDPERMITFLAG was used. */ 893 if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF)) 894 continue; 895 if (!word_ends && (flags & WF_NOCOMPAFT)) 896 continue; 897 898 /* Quickly check if compounding is possible with this flag. */ 899 if (!byte_in_str(mip->mi_complen == 0 900 ? slang->sl_compstartflags 901 : slang->sl_compallflags, 902 ((unsigned)flags >> 24))) 903 continue; 904 905 /* If there is a match with a CHECKCOMPOUNDPATTERN rule 906 * discard the compound word. */ 907 if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat)) 908 continue; 909 910 if (mode == FIND_COMPOUND) 911 { 912 int capflags; 913 914 /* Need to check the caps type of the appended compound 915 * word. */ 916 if (has_mbyte && STRNCMP(ptr, mip->mi_word, 917 mip->mi_compoff) != 0) 918 { 919 /* case folding may have changed the length */ 920 p = mip->mi_word; 921 for (s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s)) 922 MB_PTR_ADV(p); 923 } 924 else 925 p = mip->mi_word + mip->mi_compoff; 926 capflags = captype(p, mip->mi_word + wlen); 927 if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP 928 && (flags & WF_FIXCAP) != 0)) 929 continue; 930 931 if (capflags != WF_ALLCAP) 932 { 933 /* When the character before the word is a word 934 * character we do not accept a Onecap word. We do 935 * accept a no-caps word, even when the dictionary 936 * word specifies ONECAP. */ 937 MB_PTR_BACK(mip->mi_word, p); 938 if (spell_iswordp_nmw(p, mip->mi_win) 939 ? capflags == WF_ONECAP 940 : (flags & WF_ONECAP) != 0 941 && capflags != WF_ONECAP) 942 continue; 943 } 944 } 945 946 /* If the word ends the sequence of compound flags of the 947 * words must match with one of the COMPOUNDRULE items and 948 * the number of syllables must not be too large. 
*/ 949 mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24); 950 mip->mi_compflags[mip->mi_complen + 1] = NUL; 951 if (word_ends) 952 { 953 char_u fword[MAXWLEN]; 954 955 if (slang->sl_compsylmax < MAXWLEN) 956 { 957 /* "fword" is only needed for checking syllables. */ 958 if (ptr == mip->mi_word) 959 (void)spell_casefold(ptr, wlen, fword, MAXWLEN); 960 else 961 vim_strncpy(fword, ptr, endlen[endidxcnt]); 962 } 963 if (!can_compound(slang, fword, mip->mi_compflags)) 964 continue; 965 } 966 else if (slang->sl_comprules != NULL 967 && !match_compoundrule(slang, mip->mi_compflags)) 968 /* The compound flags collected so far do not match any 969 * COMPOUNDRULE, discard the compounded word. */ 970 continue; 971 } 972 973 /* Check NEEDCOMPOUND: can't use word without compounding. */ 974 else if (flags & WF_NEEDCOMP) 975 continue; 976 977 nobreak_result = SP_OK; 978 979 if (!word_ends) 980 { 981 int save_result = mip->mi_result; 982 char_u *save_end = mip->mi_end; 983 langp_T *save_lp = mip->mi_lp; 984 int lpi; 985 986 /* Check that a valid word follows. If there is one and we 987 * are compounding, it will set "mi_result", thus we are 988 * always finished here. For NOBREAK we only check that a 989 * valid word follows. 990 * Recursive! */ 991 if (slang->sl_nobreak) 992 mip->mi_result = SP_BAD; 993 994 /* Find following word in case-folded tree. */ 995 mip->mi_compoff = endlen[endidxcnt]; 996 if (has_mbyte && mode == FIND_KEEPWORD) 997 { 998 /* Compute byte length in case-folded word from "wlen": 999 * byte length in keep-case word. Length may change when 1000 * folding case. This can be slow, take a shortcut when 1001 * the case-folded word is equal to the keep-case word. 
*/ 1002 p = mip->mi_fword; 1003 if (STRNCMP(ptr, p, wlen) != 0) 1004 { 1005 for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) 1006 MB_PTR_ADV(p); 1007 mip->mi_compoff = (int)(p - mip->mi_fword); 1008 } 1009 } 1010 #if 0 /* Disabled, see below */ 1011 c = mip->mi_compoff; 1012 #endif 1013 ++mip->mi_complen; 1014 if (flags & WF_COMPROOT) 1015 ++mip->mi_compextra; 1016 1017 /* For NOBREAK we need to try all NOBREAK languages, at least 1018 * to find the ".add" file(s). */ 1019 for (lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi) 1020 { 1021 if (slang->sl_nobreak) 1022 { 1023 mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi); 1024 if (mip->mi_lp->lp_slang->sl_fidxs == NULL 1025 || !mip->mi_lp->lp_slang->sl_nobreak) 1026 continue; 1027 } 1028 1029 find_word(mip, FIND_COMPOUND); 1030 1031 /* When NOBREAK any word that matches is OK. Otherwise we 1032 * need to find the longest match, thus try with keep-case 1033 * and prefix too. */ 1034 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) 1035 { 1036 /* Find following word in keep-case tree. */ 1037 mip->mi_compoff = wlen; 1038 find_word(mip, FIND_KEEPCOMPOUND); 1039 1040 #if 0 /* Disabled, a prefix must not appear halfway a compound word, 1041 unless the COMPOUNDPERMITFLAG is used and then it can't be a 1042 postponed prefix. */ 1043 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) 1044 { 1045 /* Check for following word with prefix. 
*/ 1046 mip->mi_compoff = c; 1047 find_prefix(mip, FIND_COMPOUND); 1048 } 1049 #endif 1050 } 1051 1052 if (!slang->sl_nobreak) 1053 break; 1054 } 1055 --mip->mi_complen; 1056 if (flags & WF_COMPROOT) 1057 --mip->mi_compextra; 1058 mip->mi_lp = save_lp; 1059 1060 if (slang->sl_nobreak) 1061 { 1062 nobreak_result = mip->mi_result; 1063 mip->mi_result = save_result; 1064 mip->mi_end = save_end; 1065 } 1066 else 1067 { 1068 if (mip->mi_result == SP_OK) 1069 break; 1070 continue; 1071 } 1072 } 1073 1074 if (flags & WF_BANNED) 1075 res = SP_BANNED; 1076 else if (flags & WF_REGION) 1077 { 1078 /* Check region. */ 1079 if ((mip->mi_lp->lp_region & (flags >> 16)) != 0) 1080 res = SP_OK; 1081 else 1082 res = SP_LOCAL; 1083 } 1084 else if (flags & WF_RARE) 1085 res = SP_RARE; 1086 else 1087 res = SP_OK; 1088 1089 /* Always use the longest match and the best result. For NOBREAK 1090 * we separately keep the longest match without a following good 1091 * word as a fall-back. */ 1092 if (nobreak_result == SP_BAD) 1093 { 1094 if (mip->mi_result2 > res) 1095 { 1096 mip->mi_result2 = res; 1097 mip->mi_end2 = mip->mi_word + wlen; 1098 } 1099 else if (mip->mi_result2 == res 1100 && mip->mi_end2 < mip->mi_word + wlen) 1101 mip->mi_end2 = mip->mi_word + wlen; 1102 } 1103 else if (mip->mi_result > res) 1104 { 1105 mip->mi_result = res; 1106 mip->mi_end = mip->mi_word + wlen; 1107 } 1108 else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen) 1109 mip->mi_end = mip->mi_word + wlen; 1110 1111 if (mip->mi_result == SP_OK) 1112 break; 1113 } 1114 1115 if (mip->mi_result == SP_OK) 1116 break; 1117 } 1118 } 1119 1120 /* 1121 * Return TRUE if there is a match between the word ptr[wlen] and 1122 * CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another 1123 * word. 1124 * A match means that the first part of CHECKCOMPOUNDPATTERN matches at the 1125 * end of ptr[wlen] and the second part matches after it. 
1126 */ 1127 static int 1128 match_checkcompoundpattern( 1129 char_u *ptr, 1130 int wlen, 1131 garray_T *gap) /* &sl_comppat */ 1132 { 1133 int i; 1134 char_u *p; 1135 int len; 1136 1137 for (i = 0; i + 1 < gap->ga_len; i += 2) 1138 { 1139 p = ((char_u **)gap->ga_data)[i + 1]; 1140 if (STRNCMP(ptr + wlen, p, STRLEN(p)) == 0) 1141 { 1142 /* Second part matches at start of following compound word, now 1143 * check if first part matches at end of previous word. */ 1144 p = ((char_u **)gap->ga_data)[i]; 1145 len = (int)STRLEN(p); 1146 if (len <= wlen && STRNCMP(ptr + wlen - len, p, len) == 0) 1147 return TRUE; 1148 } 1149 } 1150 return FALSE; 1151 } 1152 1153 /* 1154 * Return TRUE if "flags" is a valid sequence of compound flags and "word" 1155 * does not have too many syllables. 1156 */ 1157 static int 1158 can_compound(slang_T *slang, char_u *word, char_u *flags) 1159 { 1160 char_u uflags[MAXWLEN * 2]; 1161 int i; 1162 char_u *p; 1163 1164 if (slang->sl_compprog == NULL) 1165 return FALSE; 1166 if (enc_utf8) 1167 { 1168 /* Need to convert the single byte flags to utf8 characters. */ 1169 p = uflags; 1170 for (i = 0; flags[i] != NUL; ++i) 1171 p += utf_char2bytes(flags[i], p); 1172 *p = NUL; 1173 p = uflags; 1174 } 1175 else 1176 p = flags; 1177 if (!vim_regexec_prog(&slang->sl_compprog, FALSE, p, 0)) 1178 return FALSE; 1179 1180 /* Count the number of syllables. This may be slow, do it last. If there 1181 * are too many syllables AND the number of compound words is above 1182 * COMPOUNDWORDMAX then compounding is not allowed. */ 1183 if (slang->sl_compsylmax < MAXWLEN 1184 && count_syllables(slang, word) > slang->sl_compsylmax) 1185 return (int)STRLEN(flags) < slang->sl_compmax; 1186 return TRUE; 1187 } 1188 1189 /* 1190 * Return TRUE when the sequence of flags in "compflags" plus "flag" can 1191 * possibly form a valid compounded word. This also checks the COMPOUNDRULE 1192 * lines if they don't contain wildcards. 
1193 */ 1194 static int 1195 can_be_compound( 1196 trystate_T *sp, 1197 slang_T *slang, 1198 char_u *compflags, 1199 int flag) 1200 { 1201 /* If the flag doesn't appear in sl_compstartflags or sl_compallflags 1202 * then it can't possibly compound. */ 1203 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit 1204 ? slang->sl_compstartflags : slang->sl_compallflags, flag)) 1205 return FALSE; 1206 1207 /* If there are no wildcards, we can check if the flags collected so far 1208 * possibly can form a match with COMPOUNDRULE patterns. This only 1209 * makes sense when we have two or more words. */ 1210 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) 1211 { 1212 int v; 1213 1214 compflags[sp->ts_complen] = flag; 1215 compflags[sp->ts_complen + 1] = NUL; 1216 v = match_compoundrule(slang, compflags + sp->ts_compsplit); 1217 compflags[sp->ts_complen] = NUL; 1218 return v; 1219 } 1220 1221 return TRUE; 1222 } 1223 1224 1225 /* 1226 * Return TRUE if the compound flags in compflags[] match the start of any 1227 * compound rule. This is used to stop trying a compound if the flags 1228 * collected so far can't possibly match any compound rule. 1229 * Caller must check that slang->sl_comprules is not NULL. 
1230 */ 1231 static int 1232 match_compoundrule(slang_T *slang, char_u *compflags) 1233 { 1234 char_u *p; 1235 int i; 1236 int c; 1237 1238 /* loop over all the COMPOUNDRULE entries */ 1239 for (p = slang->sl_comprules; *p != NUL; ++p) 1240 { 1241 /* loop over the flags in the compound word we have made, match 1242 * them against the current rule entry */ 1243 for (i = 0; ; ++i) 1244 { 1245 c = compflags[i]; 1246 if (c == NUL) 1247 /* found a rule that matches for the flags we have so far */ 1248 return TRUE; 1249 if (*p == '/' || *p == NUL) 1250 break; /* end of rule, it's too short */ 1251 if (*p == '[') 1252 { 1253 int match = FALSE; 1254 1255 /* compare against all the flags in [] */ 1256 ++p; 1257 while (*p != ']' && *p != NUL) 1258 if (*p++ == c) 1259 match = TRUE; 1260 if (!match) 1261 break; /* none matches */ 1262 } 1263 else if (*p != c) 1264 break; /* flag of word doesn't match flag in pattern */ 1265 ++p; 1266 } 1267 1268 /* Skip to the next "/", where the next pattern starts. */ 1269 p = vim_strchr(p, '/'); 1270 if (p == NULL) 1271 break; 1272 } 1273 1274 /* Checked all the rules and none of them match the flags, so there 1275 * can't possibly be a compound starting with these flags. */ 1276 return FALSE; 1277 } 1278 1279 /* 1280 * Return non-zero if the prefix indicated by "arridx" matches with the prefix 1281 * ID in "flags" for the word "word". 1282 * The WF_RAREPFX flag is included in the return value for a rare prefix. 1283 */ 1284 static int 1285 valid_word_prefix( 1286 int totprefcnt, /* nr of prefix IDs */ 1287 int arridx, /* idx in sl_pidxs[] */ 1288 int flags, 1289 char_u *word, 1290 slang_T *slang, 1291 int cond_req) /* only use prefixes with a condition */ 1292 { 1293 int prefcnt; 1294 int pidx; 1295 regprog_T **rp; 1296 int prefid; 1297 1298 prefid = (unsigned)flags >> 24; 1299 for (prefcnt = totprefcnt - 1; prefcnt >= 0; --prefcnt) 1300 { 1301 pidx = slang->sl_pidxs[arridx + prefcnt]; 1302 1303 /* Check the prefix ID. 
*/ 1304 if (prefid != (pidx & 0xff)) 1305 continue; 1306 1307 /* Check if the prefix doesn't combine and the word already has a 1308 * suffix. */ 1309 if ((flags & WF_HAS_AFF) && (pidx & WF_PFX_NC)) 1310 continue; 1311 1312 /* Check the condition, if there is one. The condition index is 1313 * stored in the two bytes above the prefix ID byte. */ 1314 rp = &slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff]; 1315 if (*rp != NULL) 1316 { 1317 if (!vim_regexec_prog(rp, FALSE, word, 0)) 1318 continue; 1319 } 1320 else if (cond_req) 1321 continue; 1322 1323 /* It's a match! Return the WF_ flags. */ 1324 return pidx; 1325 } 1326 return 0; 1327 } 1328 1329 /* 1330 * Check if the word at "mip->mi_word" has a matching prefix. 1331 * If it does, then check the following word. 1332 * 1333 * If "mode" is "FIND_COMPOUND" then do the same after another word, find a 1334 * prefix in a compound word. 1335 * 1336 * For a match mip->mi_result is updated. 1337 */ 1338 static void 1339 find_prefix(matchinf_T *mip, int mode) 1340 { 1341 idx_T arridx = 0; 1342 int len; 1343 int wlen = 0; 1344 int flen; 1345 int c; 1346 char_u *ptr; 1347 idx_T lo, hi, m; 1348 slang_T *slang = mip->mi_lp->lp_slang; 1349 char_u *byts; 1350 idx_T *idxs; 1351 1352 byts = slang->sl_pbyts; 1353 if (byts == NULL) 1354 return; /* array is empty */ 1355 1356 /* We use the case-folded word here, since prefixes are always 1357 * case-folded. */ 1358 ptr = mip->mi_fword; 1359 flen = mip->mi_fwordlen; /* available case-folded bytes */ 1360 if (mode == FIND_COMPOUND) 1361 { 1362 /* Skip over the previously found word(s). */ 1363 ptr += mip->mi_compoff; 1364 flen -= mip->mi_compoff; 1365 } 1366 idxs = slang->sl_pidxs; 1367 1368 /* 1369 * Repeat advancing in the tree until: 1370 * - there is a byte that doesn't match, 1371 * - we reach the end of the tree, 1372 * - or we reach the end of the line. 
1373 */ 1374 for (;;) 1375 { 1376 if (flen == 0 && *mip->mi_fend != NUL) 1377 flen = fold_more(mip); 1378 1379 len = byts[arridx++]; 1380 1381 /* If the first possible byte is a zero the prefix could end here. 1382 * Check if the following word matches and supports the prefix. */ 1383 if (byts[arridx] == 0) 1384 { 1385 /* There can be several prefixes with different conditions. We 1386 * try them all, since we don't know which one will give the 1387 * longest match. The word is the same each time, pass the list 1388 * of possible prefixes to find_word(). */ 1389 mip->mi_prefarridx = arridx; 1390 mip->mi_prefcnt = len; 1391 while (len > 0 && byts[arridx] == 0) 1392 { 1393 ++arridx; 1394 --len; 1395 } 1396 mip->mi_prefcnt -= len; 1397 1398 /* Find the word that comes after the prefix. */ 1399 mip->mi_prefixlen = wlen; 1400 if (mode == FIND_COMPOUND) 1401 /* Skip over the previously found word(s). */ 1402 mip->mi_prefixlen += mip->mi_compoff; 1403 1404 if (has_mbyte) 1405 { 1406 /* Case-folded length may differ from original length. */ 1407 mip->mi_cprefixlen = nofold_len(mip->mi_fword, 1408 mip->mi_prefixlen, mip->mi_word); 1409 } 1410 else 1411 mip->mi_cprefixlen = mip->mi_prefixlen; 1412 find_word(mip, FIND_PREFIX); 1413 1414 1415 if (len == 0) 1416 break; /* no children, word must end here */ 1417 } 1418 1419 /* Stop looking at end of the line. */ 1420 if (ptr[wlen] == NUL) 1421 break; 1422 1423 /* Perform a binary search in the list of accepted bytes. */ 1424 c = ptr[wlen]; 1425 lo = arridx; 1426 hi = arridx + len - 1; 1427 while (lo < hi) 1428 { 1429 m = (lo + hi) / 2; 1430 if (byts[m] > c) 1431 hi = m - 1; 1432 else if (byts[m] < c) 1433 lo = m + 1; 1434 else 1435 { 1436 lo = hi = m; 1437 break; 1438 } 1439 } 1440 1441 /* Stop if there is no matching byte. */ 1442 if (hi < lo || byts[lo] != c) 1443 break; 1444 1445 /* Continue at the child (if there is one). 
*/ 1446 arridx = idxs[lo]; 1447 ++wlen; 1448 --flen; 1449 } 1450 } 1451 1452 /* 1453 * Need to fold at least one more character. Do until next non-word character 1454 * for efficiency. Include the non-word character too. 1455 * Return the length of the folded chars in bytes. 1456 */ 1457 static int 1458 fold_more(matchinf_T *mip) 1459 { 1460 int flen; 1461 char_u *p; 1462 1463 p = mip->mi_fend; 1464 do 1465 MB_PTR_ADV(mip->mi_fend); 1466 while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win)); 1467 1468 /* Include the non-word character so that we can check for the word end. */ 1469 if (*mip->mi_fend != NUL) 1470 MB_PTR_ADV(mip->mi_fend); 1471 1472 (void)spell_casefold(p, (int)(mip->mi_fend - p), 1473 mip->mi_fword + mip->mi_fwordlen, 1474 MAXWLEN - mip->mi_fwordlen); 1475 flen = (int)STRLEN(mip->mi_fword + mip->mi_fwordlen); 1476 mip->mi_fwordlen += flen; 1477 return flen; 1478 } 1479 1480 /* 1481 * Check case flags for a word. Return TRUE if the word has the requested 1482 * case. 1483 */ 1484 static int 1485 spell_valid_case( 1486 int wordflags, /* flags for the checked word. */ 1487 int treeflags) /* flags for the word in the spell tree */ 1488 { 1489 return ((wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0) 1490 || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0 1491 && ((treeflags & WF_ONECAP) == 0 1492 || (wordflags & WF_ONECAP) != 0))); 1493 } 1494 1495 /* 1496 * Return TRUE if spell checking is not enabled. 1497 */ 1498 static int 1499 no_spell_checking(win_T *wp) 1500 { 1501 if (!wp->w_p_spell || *wp->w_s->b_p_spl == NUL 1502 || wp->w_s->b_langp.ga_len == 0) 1503 { 1504 emsg(_("E756: Spell checking is not enabled")); 1505 return TRUE; 1506 } 1507 return FALSE; 1508 } 1509 1510 /* 1511 * Move to next spell error. 1512 * "curline" is FALSE for "[s", "]s", "[S" and "]S". 1513 * "curline" is TRUE to find word under/after cursor in the same line. 
1514 * For Insert mode completion "dir" is BACKWARD and "curline" is TRUE: move 1515 * to after badly spelled word before the cursor. 1516 * Return 0 if not found, length of the badly spelled word otherwise. 1517 */ 1518 int 1519 spell_move_to( 1520 win_T *wp, 1521 int dir, /* FORWARD or BACKWARD */ 1522 int allwords, /* TRUE for "[s"/"]s", FALSE for "[S"/"]S" */ 1523 int curline, 1524 hlf_T *attrp) /* return: attributes of bad word or NULL 1525 (only when "dir" is FORWARD) */ 1526 { 1527 linenr_T lnum; 1528 pos_T found_pos; 1529 int found_len = 0; 1530 char_u *line; 1531 char_u *p; 1532 char_u *endp; 1533 hlf_T attr; 1534 int len; 1535 #ifdef FEAT_SYN_HL 1536 int has_syntax = syntax_present(wp); 1537 #endif 1538 int col; 1539 int can_spell; 1540 char_u *buf = NULL; 1541 int buflen = 0; 1542 int skip = 0; 1543 int capcol = -1; 1544 int found_one = FALSE; 1545 int wrapped = FALSE; 1546 1547 if (no_spell_checking(wp)) 1548 return 0; 1549 1550 /* 1551 * Start looking for bad word at the start of the line, because we can't 1552 * start halfway a word, we don't know where it starts or ends. 1553 * 1554 * When searching backwards, we continue in the line to find the last 1555 * bad word (in the cursor line: before the cursor). 1556 * 1557 * We concatenate the start of the next line, so that wrapped words work 1558 * (e.g. "et<line-break>cetera"). Doesn't work when searching backwards 1559 * though... 1560 */ 1561 lnum = wp->w_cursor.lnum; 1562 CLEAR_POS(&found_pos); 1563 1564 while (!got_int) 1565 { 1566 line = ml_get_buf(wp->w_buffer, lnum, FALSE); 1567 1568 len = (int)STRLEN(line); 1569 if (buflen < len + MAXWLEN + 2) 1570 { 1571 vim_free(buf); 1572 buflen = len + MAXWLEN + 2; 1573 buf = alloc(buflen); 1574 if (buf == NULL) 1575 break; 1576 } 1577 1578 /* In first line check first word for Capital. */ 1579 if (lnum == 1) 1580 capcol = 0; 1581 1582 /* For checking first word with a capital skip white space. 
*/ 1583 if (capcol == 0) 1584 capcol = getwhitecols(line); 1585 else if (curline && wp == curwin) 1586 { 1587 /* For spellbadword(): check if first word needs a capital. */ 1588 col = getwhitecols(line); 1589 if (check_need_cap(lnum, col)) 1590 capcol = col; 1591 1592 /* Need to get the line again, may have looked at the previous 1593 * one. */ 1594 line = ml_get_buf(wp->w_buffer, lnum, FALSE); 1595 } 1596 1597 /* Copy the line into "buf" and append the start of the next line if 1598 * possible. */ 1599 STRCPY(buf, line); 1600 if (lnum < wp->w_buffer->b_ml.ml_line_count) 1601 spell_cat_line(buf + STRLEN(buf), 1602 ml_get_buf(wp->w_buffer, lnum + 1, FALSE), MAXWLEN); 1603 1604 p = buf + skip; 1605 endp = buf + len; 1606 while (p < endp) 1607 { 1608 /* When searching backward don't search after the cursor. Unless 1609 * we wrapped around the end of the buffer. */ 1610 if (dir == BACKWARD 1611 && lnum == wp->w_cursor.lnum 1612 && !wrapped 1613 && (colnr_T)(p - buf) >= wp->w_cursor.col) 1614 break; 1615 1616 /* start of word */ 1617 attr = HLF_COUNT; 1618 len = spell_check(wp, p, &attr, &capcol, FALSE); 1619 1620 if (attr != HLF_COUNT) 1621 { 1622 /* We found a bad word. Check the attribute. */ 1623 if (allwords || attr == HLF_SPB) 1624 { 1625 /* When searching forward only accept a bad word after 1626 * the cursor. */ 1627 if (dir == BACKWARD 1628 || lnum != wp->w_cursor.lnum 1629 || (lnum == wp->w_cursor.lnum 1630 && (wrapped 1631 || (colnr_T)(curline ? 
p - buf + len 1632 : p - buf) 1633 > wp->w_cursor.col))) 1634 { 1635 #ifdef FEAT_SYN_HL 1636 if (has_syntax) 1637 { 1638 col = (int)(p - buf); 1639 (void)syn_get_id(wp, lnum, (colnr_T)col, 1640 FALSE, &can_spell, FALSE); 1641 if (!can_spell) 1642 attr = HLF_COUNT; 1643 } 1644 else 1645 #endif 1646 can_spell = TRUE; 1647 1648 if (can_spell) 1649 { 1650 found_one = TRUE; 1651 found_pos.lnum = lnum; 1652 found_pos.col = (int)(p - buf); 1653 found_pos.coladd = 0; 1654 if (dir == FORWARD) 1655 { 1656 /* No need to search further. */ 1657 wp->w_cursor = found_pos; 1658 vim_free(buf); 1659 if (attrp != NULL) 1660 *attrp = attr; 1661 return len; 1662 } 1663 else if (curline) 1664 /* Insert mode completion: put cursor after 1665 * the bad word. */ 1666 found_pos.col += len; 1667 found_len = len; 1668 } 1669 } 1670 else 1671 found_one = TRUE; 1672 } 1673 } 1674 1675 /* advance to character after the word */ 1676 p += len; 1677 capcol -= len; 1678 } 1679 1680 if (dir == BACKWARD && found_pos.lnum != 0) 1681 { 1682 /* Use the last match in the line (before the cursor). */ 1683 wp->w_cursor = found_pos; 1684 vim_free(buf); 1685 return found_len; 1686 } 1687 1688 if (curline) 1689 break; /* only check cursor line */ 1690 1691 /* If we are back at the starting line and searched it again there 1692 * is no match, give up. */ 1693 if (lnum == wp->w_cursor.lnum && wrapped) 1694 break; 1695 1696 /* Advance to next line. */ 1697 if (dir == BACKWARD) 1698 { 1699 if (lnum > 1) 1700 --lnum; 1701 else if (!p_ws) 1702 break; /* at first line and 'nowrapscan' */ 1703 else 1704 { 1705 /* Wrap around to the end of the buffer. May search the 1706 * starting line again and accept the last match. 
*/ 1707 lnum = wp->w_buffer->b_ml.ml_line_count; 1708 wrapped = TRUE; 1709 if (!shortmess(SHM_SEARCH)) 1710 give_warning((char_u *)_(top_bot_msg), TRUE); 1711 } 1712 capcol = -1; 1713 } 1714 else 1715 { 1716 if (lnum < wp->w_buffer->b_ml.ml_line_count) 1717 ++lnum; 1718 else if (!p_ws) 1719 break; /* at first line and 'nowrapscan' */ 1720 else 1721 { 1722 /* Wrap around to the start of the buffer. May search the 1723 * starting line again and accept the first match. */ 1724 lnum = 1; 1725 wrapped = TRUE; 1726 if (!shortmess(SHM_SEARCH)) 1727 give_warning((char_u *)_(bot_top_msg), TRUE); 1728 } 1729 1730 /* If we are back at the starting line and there is no match then 1731 * give up. */ 1732 if (lnum == wp->w_cursor.lnum && !found_one) 1733 break; 1734 1735 /* Skip the characters at the start of the next line that were 1736 * included in a match crossing line boundaries. */ 1737 if (attr == HLF_COUNT) 1738 skip = (int)(p - endp); 1739 else 1740 skip = 0; 1741 1742 /* Capcol skips over the inserted space. */ 1743 --capcol; 1744 1745 /* But after empty line check first word in next line */ 1746 if (*skipwhite(line) == NUL) 1747 capcol = 0; 1748 } 1749 1750 line_breakcheck(); 1751 } 1752 1753 vim_free(buf); 1754 return 0; 1755 } 1756 1757 /* 1758 * For spell checking: concatenate the start of the following line "line" into 1759 * "buf", blanking-out special characters. Copy less then "maxlen" bytes. 1760 * Keep the blanks at the start of the next line, this is used in win_line() 1761 * to skip those bytes if the word was OK. 1762 */ 1763 void 1764 spell_cat_line(char_u *buf, char_u *line, int maxlen) 1765 { 1766 char_u *p; 1767 int n; 1768 1769 p = skipwhite(line); 1770 while (vim_strchr((char_u *)"*#/\"\t", *p) != NULL) 1771 p = skipwhite(p + 1); 1772 1773 if (*p != NUL) 1774 { 1775 /* Only worth concatenating if there is something else than spaces to 1776 * concatenate. 
*/ 1777 n = (int)(p - line) + 1; 1778 if (n < maxlen - 1) 1779 { 1780 vim_memset(buf, ' ', n); 1781 vim_strncpy(buf + n, p, maxlen - 1 - n); 1782 } 1783 } 1784 } 1785 1786 /* 1787 * Structure used for the cookie argument of do_in_runtimepath(). 1788 */ 1789 typedef struct spelload_S 1790 { 1791 char_u sl_lang[MAXWLEN + 1]; /* language name */ 1792 slang_T *sl_slang; /* resulting slang_T struct */ 1793 int sl_nobreak; /* NOBREAK language found */ 1794 } spelload_T; 1795 1796 /* 1797 * Load word list(s) for "lang" from Vim spell file(s). 1798 * "lang" must be the language without the region: e.g., "en". 1799 */ 1800 static void 1801 spell_load_lang(char_u *lang) 1802 { 1803 char_u fname_enc[85]; 1804 int r; 1805 spelload_T sl; 1806 int round; 1807 1808 /* Copy the language name to pass it to spell_load_cb() as a cookie. 1809 * It's truncated when an error is detected. */ 1810 STRCPY(sl.sl_lang, lang); 1811 sl.sl_slang = NULL; 1812 sl.sl_nobreak = FALSE; 1813 1814 /* We may retry when no spell file is found for the language, an 1815 * autocommand may load it then. */ 1816 for (round = 1; round <= 2; ++round) 1817 { 1818 /* 1819 * Find the first spell file for "lang" in 'runtimepath' and load it. 1820 */ 1821 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, 1822 #ifdef VMS 1823 "spell/%s_%s.spl", 1824 #else 1825 "spell/%s.%s.spl", 1826 #endif 1827 lang, spell_enc()); 1828 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); 1829 1830 if (r == FAIL && *sl.sl_lang != NUL) 1831 { 1832 /* Try loading the ASCII version. 
*/ 1833 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, 1834 #ifdef VMS 1835 "spell/%s_ascii.spl", 1836 #else 1837 "spell/%s.ascii.spl", 1838 #endif 1839 lang); 1840 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); 1841 1842 if (r == FAIL && *sl.sl_lang != NUL && round == 1 1843 && apply_autocmds(EVENT_SPELLFILEMISSING, lang, 1844 curbuf->b_fname, FALSE, curbuf)) 1845 continue; 1846 break; 1847 } 1848 break; 1849 } 1850 1851 if (r == FAIL) 1852 { 1853 smsg( 1854 #ifdef VMS 1855 _("Warning: Cannot find word list \"%s_%s.spl\" or \"%s_ascii.spl\""), 1856 #else 1857 _("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\""), 1858 #endif 1859 lang, spell_enc(), lang); 1860 } 1861 else if (sl.sl_slang != NULL) 1862 { 1863 /* At least one file was loaded, now load ALL the additions. */ 1864 STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl"); 1865 do_in_runtimepath(fname_enc, DIP_ALL, spell_load_cb, &sl); 1866 } 1867 } 1868 1869 /* 1870 * Return the encoding used for spell checking: Use 'encoding', except that we 1871 * use "latin1" for "latin9". And limit to 60 characters (just in case). 1872 */ 1873 char_u * 1874 spell_enc(void) 1875 { 1876 1877 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0) 1878 return p_enc; 1879 return (char_u *)"latin1"; 1880 } 1881 1882 /* 1883 * Get the name of the .spl file for the internal wordlist into 1884 * "fname[MAXPATHL]". 1885 */ 1886 static void 1887 int_wordlist_spl(char_u *fname) 1888 { 1889 vim_snprintf((char *)fname, MAXPATHL, SPL_FNAME_TMPL, 1890 int_wordlist, spell_enc()); 1891 } 1892 1893 /* 1894 * Allocate a new slang_T for language "lang". "lang" can be NULL. 1895 * Caller must fill "sl_next". 
1896 */ 1897 slang_T * 1898 slang_alloc(char_u *lang) 1899 { 1900 slang_T *lp; 1901 1902 lp = ALLOC_CLEAR_ONE(slang_T); 1903 if (lp != NULL) 1904 { 1905 if (lang != NULL) 1906 lp->sl_name = vim_strsave(lang); 1907 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); 1908 ga_init2(&lp->sl_repsal, sizeof(fromto_T), 10); 1909 lp->sl_compmax = MAXWLEN; 1910 lp->sl_compsylmax = MAXWLEN; 1911 hash_init(&lp->sl_wordcount); 1912 } 1913 1914 return lp; 1915 } 1916 1917 /* 1918 * Free the contents of an slang_T and the structure itself. 1919 */ 1920 void 1921 slang_free(slang_T *lp) 1922 { 1923 vim_free(lp->sl_name); 1924 vim_free(lp->sl_fname); 1925 slang_clear(lp); 1926 vim_free(lp); 1927 } 1928 1929 /* 1930 * Clear an slang_T so that the file can be reloaded. 1931 */ 1932 void 1933 slang_clear(slang_T *lp) 1934 { 1935 garray_T *gap; 1936 fromto_T *ftp; 1937 salitem_T *smp; 1938 int i; 1939 int round; 1940 1941 VIM_CLEAR(lp->sl_fbyts); 1942 VIM_CLEAR(lp->sl_kbyts); 1943 VIM_CLEAR(lp->sl_pbyts); 1944 1945 VIM_CLEAR(lp->sl_fidxs); 1946 VIM_CLEAR(lp->sl_kidxs); 1947 VIM_CLEAR(lp->sl_pidxs); 1948 1949 for (round = 1; round <= 2; ++round) 1950 { 1951 gap = round == 1 ? &lp->sl_rep : &lp->sl_repsal; 1952 while (gap->ga_len > 0) 1953 { 1954 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; 1955 vim_free(ftp->ft_from); 1956 vim_free(ftp->ft_to); 1957 } 1958 ga_clear(gap); 1959 } 1960 1961 gap = &lp->sl_sal; 1962 if (lp->sl_sofo) 1963 { 1964 /* "ga_len" is set to 1 without adding an item for latin1 */ 1965 if (gap->ga_data != NULL) 1966 /* SOFOFROM and SOFOTO items: free lists of wide characters. */ 1967 for (i = 0; i < gap->ga_len; ++i) 1968 vim_free(((int **)gap->ga_data)[i]); 1969 } 1970 else 1971 /* SAL items: free salitem_T items */ 1972 while (gap->ga_len > 0) 1973 { 1974 smp = &((salitem_T *)gap->ga_data)[--gap->ga_len]; 1975 vim_free(smp->sm_lead); 1976 /* Don't free sm_oneof and sm_rules, they point into sm_lead. 
*/ 1977 vim_free(smp->sm_to); 1978 vim_free(smp->sm_lead_w); 1979 vim_free(smp->sm_oneof_w); 1980 vim_free(smp->sm_to_w); 1981 } 1982 ga_clear(gap); 1983 1984 for (i = 0; i < lp->sl_prefixcnt; ++i) 1985 vim_regfree(lp->sl_prefprog[i]); 1986 lp->sl_prefixcnt = 0; 1987 VIM_CLEAR(lp->sl_prefprog); 1988 1989 VIM_CLEAR(lp->sl_info); 1990 1991 VIM_CLEAR(lp->sl_midword); 1992 1993 vim_regfree(lp->sl_compprog); 1994 lp->sl_compprog = NULL; 1995 VIM_CLEAR(lp->sl_comprules); 1996 VIM_CLEAR(lp->sl_compstartflags); 1997 VIM_CLEAR(lp->sl_compallflags); 1998 1999 VIM_CLEAR(lp->sl_syllable); 2000 ga_clear(&lp->sl_syl_items); 2001 2002 ga_clear_strings(&lp->sl_comppat); 2003 2004 hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF); 2005 hash_init(&lp->sl_wordcount); 2006 2007 hash_clear_all(&lp->sl_map_hash, 0); 2008 2009 /* Clear info from .sug file. */ 2010 slang_clear_sug(lp); 2011 2012 lp->sl_compmax = MAXWLEN; 2013 lp->sl_compminlen = 0; 2014 lp->sl_compsylmax = MAXWLEN; 2015 lp->sl_regions[0] = NUL; 2016 } 2017 2018 /* 2019 * Clear the info from the .sug file in "lp". 2020 */ 2021 void 2022 slang_clear_sug(slang_T *lp) 2023 { 2024 VIM_CLEAR(lp->sl_sbyts); 2025 VIM_CLEAR(lp->sl_sidxs); 2026 close_spellbuf(lp->sl_sugbuf); 2027 lp->sl_sugbuf = NULL; 2028 lp->sl_sugloaded = FALSE; 2029 lp->sl_sugtime = 0; 2030 } 2031 2032 /* 2033 * Load one spell file and store the info into a slang_T. 2034 * Invoked through do_in_runtimepath(). 2035 */ 2036 static void 2037 spell_load_cb(char_u *fname, void *cookie) 2038 { 2039 spelload_T *slp = (spelload_T *)cookie; 2040 slang_T *slang; 2041 2042 slang = spell_load_file(fname, slp->sl_lang, NULL, FALSE); 2043 if (slang != NULL) 2044 { 2045 /* When a previously loaded file has NOBREAK also use it for the 2046 * ".add" files. 
*/ 2047 if (slp->sl_nobreak && slang->sl_add) 2048 slang->sl_nobreak = TRUE; 2049 else if (slang->sl_nobreak) 2050 slp->sl_nobreak = TRUE; 2051 2052 slp->sl_slang = slang; 2053 } 2054 } 2055 2056 2057 /* 2058 * Add a word to the hashtable of common words. 2059 * If it's already there then the counter is increased. 2060 */ 2061 void 2062 count_common_word( 2063 slang_T *lp, 2064 char_u *word, 2065 int len, /* word length, -1 for upto NUL */ 2066 int count) /* 1 to count once, 10 to init */ 2067 { 2068 hash_T hash; 2069 hashitem_T *hi; 2070 wordcount_T *wc; 2071 char_u buf[MAXWLEN]; 2072 char_u *p; 2073 2074 if (len == -1) 2075 p = word; 2076 else 2077 { 2078 vim_strncpy(buf, word, len); 2079 p = buf; 2080 } 2081 2082 hash = hash_hash(p); 2083 hi = hash_lookup(&lp->sl_wordcount, p, hash); 2084 if (HASHITEM_EMPTY(hi)) 2085 { 2086 wc = alloc(sizeof(wordcount_T) + STRLEN(p)); 2087 if (wc == NULL) 2088 return; 2089 STRCPY(wc->wc_word, p); 2090 wc->wc_count = count; 2091 hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash); 2092 } 2093 else 2094 { 2095 wc = HI2WC(hi); 2096 if ((wc->wc_count += count) < (unsigned)count) /* check for overflow */ 2097 wc->wc_count = MAXWORDCOUNT; 2098 } 2099 } 2100 2101 /* 2102 * Adjust the score of common words. 
2103 */ 2104 static int 2105 score_wordcount_adj( 2106 slang_T *slang, 2107 int score, 2108 char_u *word, 2109 int split) /* word was split, less bonus */ 2110 { 2111 hashitem_T *hi; 2112 wordcount_T *wc; 2113 int bonus; 2114 int newscore; 2115 2116 hi = hash_find(&slang->sl_wordcount, word); 2117 if (!HASHITEM_EMPTY(hi)) 2118 { 2119 wc = HI2WC(hi); 2120 if (wc->wc_count < SCORE_THRES2) 2121 bonus = SCORE_COMMON1; 2122 else if (wc->wc_count < SCORE_THRES3) 2123 bonus = SCORE_COMMON2; 2124 else 2125 bonus = SCORE_COMMON3; 2126 if (split) 2127 newscore = score - bonus / 2; 2128 else 2129 newscore = score - bonus; 2130 if (newscore < 0) 2131 return 0; 2132 return newscore; 2133 } 2134 return score; 2135 } 2136 2137 2138 /* 2139 * Return TRUE if byte "n" appears in "str". 2140 * Like strchr() but independent of locale. 2141 */ 2142 int 2143 byte_in_str(char_u *str, int n) 2144 { 2145 char_u *p; 2146 2147 for (p = str; *p != NUL; ++p) 2148 if (*p == n) 2149 return TRUE; 2150 return FALSE; 2151 } 2152 2153 #define SY_MAXLEN 30 2154 typedef struct syl_item_S 2155 { 2156 char_u sy_chars[SY_MAXLEN]; /* the sequence of chars */ 2157 int sy_len; 2158 } syl_item_T; 2159 2160 /* 2161 * Truncate "slang->sl_syllable" at the first slash and put the following items 2162 * in "slang->sl_syl_items". 
2163 */ 2164 int 2165 init_syl_tab(slang_T *slang) 2166 { 2167 char_u *p; 2168 char_u *s; 2169 int l; 2170 syl_item_T *syl; 2171 2172 ga_init2(&slang->sl_syl_items, sizeof(syl_item_T), 4); 2173 p = vim_strchr(slang->sl_syllable, '/'); 2174 while (p != NULL) 2175 { 2176 *p++ = NUL; 2177 if (*p == NUL) /* trailing slash */ 2178 break; 2179 s = p; 2180 p = vim_strchr(p, '/'); 2181 if (p == NULL) 2182 l = (int)STRLEN(s); 2183 else 2184 l = (int)(p - s); 2185 if (l >= SY_MAXLEN) 2186 return SP_FORMERROR; 2187 if (ga_grow(&slang->sl_syl_items, 1) == FAIL) 2188 return SP_OTHERERROR; 2189 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) 2190 + slang->sl_syl_items.ga_len++; 2191 vim_strncpy(syl->sy_chars, s, l); 2192 syl->sy_len = l; 2193 } 2194 return OK; 2195 } 2196 2197 /* 2198 * Count the number of syllables in "word". 2199 * When "word" contains spaces the syllables after the last space are counted. 2200 * Returns zero if syllables are not defines. 2201 */ 2202 static int 2203 count_syllables(slang_T *slang, char_u *word) 2204 { 2205 int cnt = 0; 2206 int skip = FALSE; 2207 char_u *p; 2208 int len; 2209 int i; 2210 syl_item_T *syl; 2211 int c; 2212 2213 if (slang->sl_syllable == NULL) 2214 return 0; 2215 2216 for (p = word; *p != NUL; p += len) 2217 { 2218 /* When running into a space reset counter. */ 2219 if (*p == ' ') 2220 { 2221 len = 1; 2222 cnt = 0; 2223 continue; 2224 } 2225 2226 /* Find longest match of syllable items. */ 2227 len = 0; 2228 for (i = 0; i < slang->sl_syl_items.ga_len; ++i) 2229 { 2230 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) + i; 2231 if (syl->sy_len > len 2232 && STRNCMP(p, syl->sy_chars, syl->sy_len) == 0) 2233 len = syl->sy_len; 2234 } 2235 if (len != 0) /* found a match, count syllable */ 2236 { 2237 ++cnt; 2238 skip = FALSE; 2239 } 2240 else 2241 { 2242 /* No recognized syllable item, at least a syllable char then? 
*/ 2243 c = mb_ptr2char(p); 2244 len = (*mb_ptr2len)(p); 2245 if (vim_strchr(slang->sl_syllable, c) == NULL) 2246 skip = FALSE; /* No, search for next syllable */ 2247 else if (!skip) 2248 { 2249 ++cnt; /* Yes, count it */ 2250 skip = TRUE; /* don't count following syllable chars */ 2251 } 2252 } 2253 } 2254 return cnt; 2255 } 2256 2257 /* 2258 * Parse 'spelllang' and set w_s->b_langp accordingly. 2259 * Returns NULL if it's OK, an error message otherwise. 2260 */ 2261 char * 2262 did_set_spelllang(win_T *wp) 2263 { 2264 garray_T ga; 2265 char_u *splp; 2266 char_u *region; 2267 char_u region_cp[3]; 2268 int filename; 2269 int region_mask; 2270 slang_T *slang; 2271 int c; 2272 char_u lang[MAXWLEN + 1]; 2273 char_u spf_name[MAXPATHL]; 2274 int len; 2275 char_u *p; 2276 int round; 2277 char_u *spf; 2278 char_u *use_region = NULL; 2279 int dont_use_region = FALSE; 2280 int nobreak = FALSE; 2281 int i, j; 2282 langp_T *lp, *lp2; 2283 static int recursive = FALSE; 2284 char *ret_msg = NULL; 2285 char_u *spl_copy; 2286 bufref_T bufref; 2287 2288 set_bufref(&bufref, wp->w_buffer); 2289 2290 /* We don't want to do this recursively. May happen when a language is 2291 * not available and the SpellFileMissing autocommand opens a new buffer 2292 * in which 'spell' is set. */ 2293 if (recursive) 2294 return NULL; 2295 recursive = TRUE; 2296 2297 ga_init2(&ga, sizeof(langp_T), 2); 2298 clear_midword(wp); 2299 2300 /* Make a copy of 'spelllang', the SpellFileMissing autocommands may change 2301 * it under our fingers. */ 2302 spl_copy = vim_strsave(wp->w_s->b_p_spl); 2303 if (spl_copy == NULL) 2304 goto theend; 2305 2306 wp->w_s->b_cjk = 0; 2307 2308 /* Loop over comma separated language names. */ 2309 for (splp = spl_copy; *splp != NUL; ) 2310 { 2311 // Get one language name. 
2312 copy_option_part(&splp, lang, MAXWLEN, ","); 2313 region = NULL; 2314 len = (int)STRLEN(lang); 2315 2316 if (!valid_spellang(lang)) 2317 continue; 2318 2319 if (STRCMP(lang, "cjk") == 0) 2320 { 2321 wp->w_s->b_cjk = 1; 2322 continue; 2323 } 2324 2325 /* If the name ends in ".spl" use it as the name of the spell file. 2326 * If there is a region name let "region" point to it and remove it 2327 * from the name. */ 2328 if (len > 4 && fnamecmp(lang + len - 4, ".spl") == 0) 2329 { 2330 filename = TRUE; 2331 2332 /* Locate a region and remove it from the file name. */ 2333 p = vim_strchr(gettail(lang), '_'); 2334 if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2]) 2335 && !ASCII_ISALPHA(p[3])) 2336 { 2337 vim_strncpy(region_cp, p + 1, 2); 2338 mch_memmove(p, p + 3, len - (p - lang) - 2); 2339 region = region_cp; 2340 } 2341 else 2342 dont_use_region = TRUE; 2343 2344 /* Check if we loaded this language before. */ 2345 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2346 if (fullpathcmp(lang, slang->sl_fname, FALSE, TRUE) == FPC_SAME) 2347 break; 2348 } 2349 else 2350 { 2351 filename = FALSE; 2352 if (len > 3 && lang[len - 3] == '_') 2353 { 2354 region = lang + len - 2; 2355 len -= 3; 2356 lang[len] = NUL; 2357 } 2358 else 2359 dont_use_region = TRUE; 2360 2361 /* Check if we loaded this language before. */ 2362 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2363 if (STRICMP(lang, slang->sl_name) == 0) 2364 break; 2365 } 2366 2367 if (region != NULL) 2368 { 2369 /* If the region differs from what was used before then don't 2370 * use it for 'spellfile'. */ 2371 if (use_region != NULL && STRCMP(region, use_region) != 0) 2372 dont_use_region = TRUE; 2373 use_region = region; 2374 } 2375 2376 /* If not found try loading the language now. 
*/ 2377 if (slang == NULL) 2378 { 2379 if (filename) 2380 (void)spell_load_file(lang, lang, NULL, FALSE); 2381 else 2382 { 2383 spell_load_lang(lang); 2384 /* SpellFileMissing autocommands may do anything, including 2385 * destroying the buffer we are using... */ 2386 if (!bufref_valid(&bufref)) 2387 { 2388 ret_msg = N_("E797: SpellFileMissing autocommand deleted buffer"); 2389 goto theend; 2390 } 2391 } 2392 } 2393 2394 /* 2395 * Loop over the languages, there can be several files for "lang". 2396 */ 2397 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2398 if (filename ? fullpathcmp(lang, slang->sl_fname, FALSE, TRUE) 2399 == FPC_SAME 2400 : STRICMP(lang, slang->sl_name) == 0) 2401 { 2402 region_mask = REGION_ALL; 2403 if (!filename && region != NULL) 2404 { 2405 /* find region in sl_regions */ 2406 c = find_region(slang->sl_regions, region); 2407 if (c == REGION_ALL) 2408 { 2409 if (slang->sl_add) 2410 { 2411 if (*slang->sl_regions != NUL) 2412 /* This addition file is for other regions. */ 2413 region_mask = 0; 2414 } 2415 else 2416 /* This is probably an error. Give a warning and 2417 * accept the words anyway. */ 2418 smsg(_("Warning: region %s not supported"), 2419 region); 2420 } 2421 else 2422 region_mask = 1 << c; 2423 } 2424 2425 if (region_mask != 0) 2426 { 2427 if (ga_grow(&ga, 1) == FAIL) 2428 { 2429 ga_clear(&ga); 2430 ret_msg = e_outofmem; 2431 goto theend; 2432 } 2433 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang; 2434 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; 2435 ++ga.ga_len; 2436 use_midword(slang, wp); 2437 if (slang->sl_nobreak) 2438 nobreak = TRUE; 2439 } 2440 } 2441 } 2442 2443 /* round 0: load int_wordlist, if possible. 2444 * round 1: load first name in 'spellfile'. 2445 * round 2: load second name in 'spellfile. 2446 * etc. */ 2447 spf = curwin->w_s->b_p_spf; 2448 for (round = 0; round == 0 || *spf != NUL; ++round) 2449 { 2450 if (round == 0) 2451 { 2452 /* Internal wordlist, if there is one. 
*/ 2453 if (int_wordlist == NULL) 2454 continue; 2455 int_wordlist_spl(spf_name); 2456 } 2457 else 2458 { 2459 /* One entry in 'spellfile'. */ 2460 copy_option_part(&spf, spf_name, MAXPATHL - 5, ","); 2461 STRCAT(spf_name, ".spl"); 2462 2463 /* If it was already found above then skip it. */ 2464 for (c = 0; c < ga.ga_len; ++c) 2465 { 2466 p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname; 2467 if (p != NULL && fullpathcmp(spf_name, p, FALSE, TRUE) 2468 == FPC_SAME) 2469 break; 2470 } 2471 if (c < ga.ga_len) 2472 continue; 2473 } 2474 2475 /* Check if it was loaded already. */ 2476 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2477 if (fullpathcmp(spf_name, slang->sl_fname, FALSE, TRUE) 2478 == FPC_SAME) 2479 break; 2480 if (slang == NULL) 2481 { 2482 /* Not loaded, try loading it now. The language name includes the 2483 * region name, the region is ignored otherwise. for int_wordlist 2484 * use an arbitrary name. */ 2485 if (round == 0) 2486 STRCPY(lang, "internal wordlist"); 2487 else 2488 { 2489 vim_strncpy(lang, gettail(spf_name), MAXWLEN); 2490 p = vim_strchr(lang, '.'); 2491 if (p != NULL) 2492 *p = NUL; /* truncate at ".encoding.add" */ 2493 } 2494 slang = spell_load_file(spf_name, lang, NULL, TRUE); 2495 2496 /* If one of the languages has NOBREAK we assume the addition 2497 * files also have this. */ 2498 if (slang != NULL && nobreak) 2499 slang->sl_nobreak = TRUE; 2500 } 2501 if (slang != NULL && ga_grow(&ga, 1) == OK) 2502 { 2503 region_mask = REGION_ALL; 2504 if (use_region != NULL && !dont_use_region) 2505 { 2506 /* find region in sl_regions */ 2507 c = find_region(slang->sl_regions, use_region); 2508 if (c != REGION_ALL) 2509 region_mask = 1 << c; 2510 else if (*slang->sl_regions != NUL) 2511 /* This spell file is for other regions. 
*/ 2512 region_mask = 0; 2513 } 2514 2515 if (region_mask != 0) 2516 { 2517 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang; 2518 LANGP_ENTRY(ga, ga.ga_len)->lp_sallang = NULL; 2519 LANGP_ENTRY(ga, ga.ga_len)->lp_replang = NULL; 2520 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; 2521 ++ga.ga_len; 2522 use_midword(slang, wp); 2523 } 2524 } 2525 } 2526 2527 /* Everything is fine, store the new b_langp value. */ 2528 ga_clear(&wp->w_s->b_langp); 2529 wp->w_s->b_langp = ga; 2530 2531 /* For each language figure out what language to use for sound folding and 2532 * REP items. If the language doesn't support it itself use another one 2533 * with the same name. E.g. for "en-math" use "en". */ 2534 for (i = 0; i < ga.ga_len; ++i) 2535 { 2536 lp = LANGP_ENTRY(ga, i); 2537 2538 /* sound folding */ 2539 if (lp->lp_slang->sl_sal.ga_len > 0) 2540 /* language does sound folding itself */ 2541 lp->lp_sallang = lp->lp_slang; 2542 else 2543 /* find first similar language that does sound folding */ 2544 for (j = 0; j < ga.ga_len; ++j) 2545 { 2546 lp2 = LANGP_ENTRY(ga, j); 2547 if (lp2->lp_slang->sl_sal.ga_len > 0 2548 && STRNCMP(lp->lp_slang->sl_name, 2549 lp2->lp_slang->sl_name, 2) == 0) 2550 { 2551 lp->lp_sallang = lp2->lp_slang; 2552 break; 2553 } 2554 } 2555 2556 /* REP items */ 2557 if (lp->lp_slang->sl_rep.ga_len > 0) 2558 /* language has REP items itself */ 2559 lp->lp_replang = lp->lp_slang; 2560 else 2561 /* find first similar language that has REP items */ 2562 for (j = 0; j < ga.ga_len; ++j) 2563 { 2564 lp2 = LANGP_ENTRY(ga, j); 2565 if (lp2->lp_slang->sl_rep.ga_len > 0 2566 && STRNCMP(lp->lp_slang->sl_name, 2567 lp2->lp_slang->sl_name, 2) == 0) 2568 { 2569 lp->lp_replang = lp2->lp_slang; 2570 break; 2571 } 2572 } 2573 } 2574 2575 theend: 2576 vim_free(spl_copy); 2577 recursive = FALSE; 2578 redraw_win_later(wp, NOT_VALID); 2579 return ret_msg; 2580 } 2581 2582 /* 2583 * Clear the midword characters for buffer "buf". 
2584 */ 2585 static void 2586 clear_midword(win_T *wp) 2587 { 2588 vim_memset(wp->w_s->b_spell_ismw, 0, 256); 2589 VIM_CLEAR(wp->w_s->b_spell_ismw_mb); 2590 } 2591 2592 /* 2593 * Use the "sl_midword" field of language "lp" for buffer "buf". 2594 * They add up to any currently used midword characters. 2595 */ 2596 static void 2597 use_midword(slang_T *lp, win_T *wp) 2598 { 2599 char_u *p; 2600 2601 if (lp->sl_midword == NULL) /* there aren't any */ 2602 return; 2603 2604 for (p = lp->sl_midword; *p != NUL; ) 2605 if (has_mbyte) 2606 { 2607 int c, l, n; 2608 char_u *bp; 2609 2610 c = mb_ptr2char(p); 2611 l = (*mb_ptr2len)(p); 2612 if (c < 256 && l <= 2) 2613 wp->w_s->b_spell_ismw[c] = TRUE; 2614 else if (wp->w_s->b_spell_ismw_mb == NULL) 2615 /* First multi-byte char in "b_spell_ismw_mb". */ 2616 wp->w_s->b_spell_ismw_mb = vim_strnsave(p, l); 2617 else 2618 { 2619 /* Append multi-byte chars to "b_spell_ismw_mb". */ 2620 n = (int)STRLEN(wp->w_s->b_spell_ismw_mb); 2621 bp = vim_strnsave(wp->w_s->b_spell_ismw_mb, n + l); 2622 if (bp != NULL) 2623 { 2624 vim_free(wp->w_s->b_spell_ismw_mb); 2625 wp->w_s->b_spell_ismw_mb = bp; 2626 vim_strncpy(bp + n, p, l); 2627 } 2628 } 2629 p += l; 2630 } 2631 else 2632 wp->w_s->b_spell_ismw[*p++] = TRUE; 2633 } 2634 2635 /* 2636 * Find the region "region[2]" in "rp" (points to "sl_regions"). 2637 * Each region is simply stored as the two characters of its name. 2638 * Returns the index if found (first is 0), REGION_ALL if not found. 2639 */ 2640 static int 2641 find_region(char_u *rp, char_u *region) 2642 { 2643 int i; 2644 2645 for (i = 0; ; i += 2) 2646 { 2647 if (rp[i] == NUL) 2648 return REGION_ALL; 2649 if (rp[i] == region[0] && rp[i + 1] == region[1]) 2650 break; 2651 } 2652 return i / 2; 2653 } 2654 2655 /* 2656 * Return case type of word: 2657 * w word 0 2658 * Word WF_ONECAP 2659 * W WORD WF_ALLCAP 2660 * WoRd wOrd WF_KEEPCAP 2661 */ 2662 int 2663 captype( 2664 char_u *word, 2665 char_u *end) /* When NULL use up to NUL byte. 
*/ 2666 { 2667 char_u *p; 2668 int c; 2669 int firstcap; 2670 int allcap; 2671 int past_second = FALSE; /* past second word char */ 2672 2673 /* find first letter */ 2674 for (p = word; !spell_iswordp_nmw(p, curwin); MB_PTR_ADV(p)) 2675 if (end == NULL ? *p == NUL : p >= end) 2676 return 0; /* only non-word characters, illegal word */ 2677 if (has_mbyte) 2678 c = mb_ptr2char_adv(&p); 2679 else 2680 c = *p++; 2681 firstcap = allcap = SPELL_ISUPPER(c); 2682 2683 /* 2684 * Need to check all letters to find a word with mixed upper/lower. 2685 * But a word with an upper char only at start is a ONECAP. 2686 */ 2687 for ( ; end == NULL ? *p != NUL : p < end; MB_PTR_ADV(p)) 2688 if (spell_iswordp_nmw(p, curwin)) 2689 { 2690 c = PTR2CHAR(p); 2691 if (!SPELL_ISUPPER(c)) 2692 { 2693 /* UUl -> KEEPCAP */ 2694 if (past_second && allcap) 2695 return WF_KEEPCAP; 2696 allcap = FALSE; 2697 } 2698 else if (!allcap) 2699 /* UlU -> KEEPCAP */ 2700 return WF_KEEPCAP; 2701 past_second = TRUE; 2702 } 2703 2704 if (allcap) 2705 return WF_ALLCAP; 2706 if (firstcap) 2707 return WF_ONECAP; 2708 return 0; 2709 } 2710 2711 /* 2712 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a 2713 * capital. So that make_case_word() can turn WOrd into Word. 2714 * Add ALLCAP for "WOrD". 2715 */ 2716 static int 2717 badword_captype(char_u *word, char_u *end) 2718 { 2719 int flags = captype(word, end); 2720 int c; 2721 int l, u; 2722 int first; 2723 char_u *p; 2724 2725 if (flags & WF_KEEPCAP) 2726 { 2727 /* Count the number of UPPER and lower case letters. */ 2728 l = u = 0; 2729 first = FALSE; 2730 for (p = word; p < end; MB_PTR_ADV(p)) 2731 { 2732 c = PTR2CHAR(p); 2733 if (SPELL_ISUPPER(c)) 2734 { 2735 ++u; 2736 if (p == word) 2737 first = TRUE; 2738 } 2739 else 2740 ++l; 2741 } 2742 2743 /* If there are more UPPER than lower case letters suggest an 2744 * ALLCAP word. Otherwise, if the first letter is UPPER then 2745 * suggest ONECAP. 
Exception: "ALl" most likely should be "All", 2746 * require three upper case letters. */ 2747 if (u > l && u > 2) 2748 flags |= WF_ALLCAP; 2749 else if (first) 2750 flags |= WF_ONECAP; 2751 2752 if (u >= 2 && l >= 2) /* maCARONI maCAroni */ 2753 flags |= WF_MIXCAP; 2754 } 2755 return flags; 2756 } 2757 2758 /* 2759 * Delete the internal wordlist and its .spl file. 2760 */ 2761 void 2762 spell_delete_wordlist(void) 2763 { 2764 char_u fname[MAXPATHL]; 2765 2766 if (int_wordlist != NULL) 2767 { 2768 mch_remove(int_wordlist); 2769 int_wordlist_spl(fname); 2770 mch_remove(fname); 2771 VIM_CLEAR(int_wordlist); 2772 } 2773 } 2774 2775 /* 2776 * Free all languages. 2777 */ 2778 void 2779 spell_free_all(void) 2780 { 2781 slang_T *slang; 2782 buf_T *buf; 2783 2784 /* Go through all buffers and handle 'spelllang'. <VN> */ 2785 FOR_ALL_BUFFERS(buf) 2786 ga_clear(&buf->b_s.b_langp); 2787 2788 while (first_lang != NULL) 2789 { 2790 slang = first_lang; 2791 first_lang = slang->sl_next; 2792 slang_free(slang); 2793 } 2794 2795 spell_delete_wordlist(); 2796 2797 VIM_CLEAR(repl_to); 2798 VIM_CLEAR(repl_from); 2799 } 2800 2801 /* 2802 * Clear all spelling tables and reload them. 2803 * Used after 'encoding' is set and when ":mkspell" was used. 2804 */ 2805 void 2806 spell_reload(void) 2807 { 2808 win_T *wp; 2809 2810 /* Initialize the table for spell_iswordp(). */ 2811 init_spell_chartab(); 2812 2813 /* Unload all allocated memory. */ 2814 spell_free_all(); 2815 2816 /* Go through all buffers and handle 'spelllang'. */ 2817 FOR_ALL_WINDOWS(wp) 2818 { 2819 /* Only load the wordlists when 'spelllang' is set and there is a 2820 * window for this buffer in which 'spell' is set. */ 2821 if (*wp->w_s->b_p_spl != NUL) 2822 { 2823 if (wp->w_p_spell) 2824 { 2825 (void)did_set_spelllang(wp); 2826 break; 2827 } 2828 } 2829 } 2830 } 2831 2832 /* 2833 * Opposite of offset2bytes(). 2834 * "pp" points to the bytes and is advanced over it. 2835 * Returns the offset. 
2836 */ 2837 static int 2838 bytes2offset(char_u **pp) 2839 { 2840 char_u *p = *pp; 2841 int nr; 2842 int c; 2843 2844 c = *p++; 2845 if ((c & 0x80) == 0x00) /* 1 byte */ 2846 { 2847 nr = c - 1; 2848 } 2849 else if ((c & 0xc0) == 0x80) /* 2 bytes */ 2850 { 2851 nr = (c & 0x3f) - 1; 2852 nr = nr * 255 + (*p++ - 1); 2853 } 2854 else if ((c & 0xe0) == 0xc0) /* 3 bytes */ 2855 { 2856 nr = (c & 0x1f) - 1; 2857 nr = nr * 255 + (*p++ - 1); 2858 nr = nr * 255 + (*p++ - 1); 2859 } 2860 else /* 4 bytes */ 2861 { 2862 nr = (c & 0x0f) - 1; 2863 nr = nr * 255 + (*p++ - 1); 2864 nr = nr * 255 + (*p++ - 1); 2865 nr = nr * 255 + (*p++ - 1); 2866 } 2867 2868 *pp = p; 2869 return nr; 2870 } 2871 2872 2873 /* 2874 * Open a spell buffer. This is a nameless buffer that is not in the buffer 2875 * list and only contains text lines. Can use a swapfile to reduce memory 2876 * use. 2877 * Most other fields are invalid! Esp. watch out for string options being 2878 * NULL and there is no undo info. 2879 * Returns NULL when out of memory. 2880 */ 2881 buf_T * 2882 open_spellbuf(void) 2883 { 2884 buf_T *buf; 2885 2886 buf = ALLOC_CLEAR_ONE(buf_T); 2887 if (buf != NULL) 2888 { 2889 buf->b_spell = TRUE; 2890 buf->b_p_swf = TRUE; /* may create a swap file */ 2891 #ifdef FEAT_CRYPT 2892 buf->b_p_key = empty_option; 2893 #endif 2894 ml_open(buf); 2895 ml_open_file(buf); /* create swap file now */ 2896 } 2897 return buf; 2898 } 2899 2900 /* 2901 * Close the buffer used for spell info. 2902 */ 2903 void 2904 close_spellbuf(buf_T *buf) 2905 { 2906 if (buf != NULL) 2907 { 2908 ml_close(buf, TRUE); 2909 vim_free(buf); 2910 } 2911 } 2912 2913 /* 2914 * Init the chartab used for spelling for ASCII. 2915 * EBCDIC is not supported! 2916 */ 2917 void 2918 clear_spell_chartab(spelltab_T *sp) 2919 { 2920 int i; 2921 2922 /* Init everything to FALSE. 
*/ 2923 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw)); 2924 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu)); 2925 for (i = 0; i < 256; ++i) 2926 { 2927 sp->st_fold[i] = i; 2928 sp->st_upper[i] = i; 2929 } 2930 2931 /* We include digits. A word shouldn't start with a digit, but handling 2932 * that is done separately. */ 2933 for (i = '0'; i <= '9'; ++i) 2934 sp->st_isw[i] = TRUE; 2935 for (i = 'A'; i <= 'Z'; ++i) 2936 { 2937 sp->st_isw[i] = TRUE; 2938 sp->st_isu[i] = TRUE; 2939 sp->st_fold[i] = i + 0x20; 2940 } 2941 for (i = 'a'; i <= 'z'; ++i) 2942 { 2943 sp->st_isw[i] = TRUE; 2944 sp->st_upper[i] = i - 0x20; 2945 } 2946 } 2947 2948 /* 2949 * Init the chartab used for spelling. Only depends on 'encoding'. 2950 * Called once while starting up and when 'encoding' changes. 2951 * The default is to use isalpha(), but the spell file should define the word 2952 * characters to make it possible that 'encoding' differs from the current 2953 * locale. For utf-8 we don't use isalpha() but our own functions. 2954 */ 2955 void 2956 init_spell_chartab(void) 2957 { 2958 int i; 2959 2960 did_set_spelltab = FALSE; 2961 clear_spell_chartab(&spelltab); 2962 if (enc_dbcs) 2963 { 2964 /* DBCS: assume double-wide characters are word characters. */ 2965 for (i = 128; i <= 255; ++i) 2966 if (MB_BYTE2LEN(i) == 2) 2967 spelltab.st_isw[i] = TRUE; 2968 } 2969 else if (enc_utf8) 2970 { 2971 for (i = 128; i < 256; ++i) 2972 { 2973 int f = utf_fold(i); 2974 int u = utf_toupper(i); 2975 2976 spelltab.st_isu[i] = utf_isupper(i); 2977 spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i); 2978 /* The folded/upper-cased value is different between latin1 and 2979 * utf8 for 0xb5, causing E763 for no good reason. Use the latin1 2980 * value for utf-8 to avoid this. */ 2981 spelltab.st_fold[i] = (f < 256) ? f : i; 2982 spelltab.st_upper[i] = (u < 256) ? u : i; 2983 } 2984 } 2985 else 2986 { 2987 /* Rough guess: use locale-dependent library functions. 
*/ 2988 for (i = 128; i < 256; ++i) 2989 { 2990 if (MB_ISUPPER(i)) 2991 { 2992 spelltab.st_isw[i] = TRUE; 2993 spelltab.st_isu[i] = TRUE; 2994 spelltab.st_fold[i] = MB_TOLOWER(i); 2995 } 2996 else if (MB_ISLOWER(i)) 2997 { 2998 spelltab.st_isw[i] = TRUE; 2999 spelltab.st_upper[i] = MB_TOUPPER(i); 3000 } 3001 } 3002 } 3003 } 3004 3005 3006 /* 3007 * Return TRUE if "p" points to a word character. 3008 * As a special case we see "midword" characters as word character when it is 3009 * followed by a word character. This finds they'there but not 'they there'. 3010 * Thus this only works properly when past the first character of the word. 3011 */ 3012 static int 3013 spell_iswordp( 3014 char_u *p, 3015 win_T *wp) /* buffer used */ 3016 { 3017 char_u *s; 3018 int l; 3019 int c; 3020 3021 if (has_mbyte) 3022 { 3023 l = MB_PTR2LEN(p); 3024 s = p; 3025 if (l == 1) 3026 { 3027 /* be quick for ASCII */ 3028 if (wp->w_s->b_spell_ismw[*p]) 3029 s = p + 1; /* skip a mid-word character */ 3030 } 3031 else 3032 { 3033 c = mb_ptr2char(p); 3034 if (c < 256 ? wp->w_s->b_spell_ismw[c] 3035 : (wp->w_s->b_spell_ismw_mb != NULL 3036 && vim_strchr(wp->w_s->b_spell_ismw_mb, c) != NULL)) 3037 s = p + l; 3038 } 3039 3040 c = mb_ptr2char(s); 3041 if (c > 255) 3042 return spell_mb_isword_class(mb_get_class(s), wp); 3043 return spelltab.st_isw[c]; 3044 } 3045 3046 return spelltab.st_isw[wp->w_s->b_spell_ismw[*p] ? p[1] : p[0]]; 3047 } 3048 3049 /* 3050 * Return TRUE if "p" points to a word character. 3051 * Unlike spell_iswordp() this doesn't check for "midword" characters. 3052 */ 3053 int 3054 spell_iswordp_nmw(char_u *p, win_T *wp) 3055 { 3056 int c; 3057 3058 if (has_mbyte) 3059 { 3060 c = mb_ptr2char(p); 3061 if (c > 255) 3062 return spell_mb_isword_class(mb_get_class(p), wp); 3063 return spelltab.st_isw[c]; 3064 } 3065 return spelltab.st_isw[*p]; 3066 } 3067 3068 /* 3069 * Return TRUE if word class indicates a word character. 3070 * Only for characters above 255. 
3071 * Unicode subscript and superscript are not considered word characters. 3072 * See also dbcs_class() and utf_class() in mbyte.c. 3073 */ 3074 static int 3075 spell_mb_isword_class(int cl, win_T *wp) 3076 { 3077 if (wp->w_s->b_cjk) 3078 /* East Asian characters are not considered word characters. */ 3079 return cl == 2 || cl == 0x2800; 3080 return cl >= 2 && cl != 0x2070 && cl != 0x2080 && cl != 3; 3081 } 3082 3083 /* 3084 * Return TRUE if "p" points to a word character. 3085 * Wide version of spell_iswordp(). 3086 */ 3087 static int 3088 spell_iswordp_w(int *p, win_T *wp) 3089 { 3090 int *s; 3091 3092 if (*p < 256 ? wp->w_s->b_spell_ismw[*p] 3093 : (wp->w_s->b_spell_ismw_mb != NULL 3094 && vim_strchr(wp->w_s->b_spell_ismw_mb, *p) != NULL)) 3095 s = p + 1; 3096 else 3097 s = p; 3098 3099 if (*s > 255) 3100 { 3101 if (enc_utf8) 3102 return spell_mb_isword_class(utf_class(*s), wp); 3103 if (enc_dbcs) 3104 return spell_mb_isword_class( 3105 dbcs_class((unsigned)*s >> 8, *s & 0xff), wp); 3106 return 0; 3107 } 3108 return spelltab.st_isw[*s]; 3109 } 3110 3111 /* 3112 * Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated. 3113 * Uses the character definitions from the .spl file. 3114 * When using a multi-byte 'encoding' the length may change! 3115 * Returns FAIL when something wrong. 3116 */ 3117 int 3118 spell_casefold( 3119 char_u *str, 3120 int len, 3121 char_u *buf, 3122 int buflen) 3123 { 3124 int i; 3125 3126 if (len >= buflen) 3127 { 3128 buf[0] = NUL; 3129 return FAIL; /* result will not fit */ 3130 } 3131 3132 if (has_mbyte) 3133 { 3134 int outi = 0; 3135 char_u *p; 3136 int c; 3137 3138 /* Fold one character at a time. */ 3139 for (p = str; p < str + len; ) 3140 { 3141 if (outi + MB_MAXBYTES > buflen) 3142 { 3143 buf[outi] = NUL; 3144 return FAIL; 3145 } 3146 c = mb_cptr2char_adv(&p); 3147 outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi); 3148 } 3149 buf[outi] = NUL; 3150 } 3151 else 3152 { 3153 /* Be quick for non-multibyte encodings. 
*/ 3154 for (i = 0; i < len; ++i) 3155 buf[i] = spelltab.st_fold[str[i]]; 3156 buf[i] = NUL; 3157 } 3158 3159 return OK; 3160 } 3161 3162 /* values for sps_flags */ 3163 #define SPS_BEST 1 3164 #define SPS_FAST 2 3165 #define SPS_DOUBLE 4 3166 3167 static int sps_flags = SPS_BEST; /* flags from 'spellsuggest' */ 3168 static int sps_limit = 9999; /* max nr of suggestions given */ 3169 3170 /* 3171 * Check the 'spellsuggest' option. Return FAIL if it's wrong. 3172 * Sets "sps_flags" and "sps_limit". 3173 */ 3174 int 3175 spell_check_sps(void) 3176 { 3177 char_u *p; 3178 char_u *s; 3179 char_u buf[MAXPATHL]; 3180 int f; 3181 3182 sps_flags = 0; 3183 sps_limit = 9999; 3184 3185 for (p = p_sps; *p != NUL; ) 3186 { 3187 copy_option_part(&p, buf, MAXPATHL, ","); 3188 3189 f = 0; 3190 if (VIM_ISDIGIT(*buf)) 3191 { 3192 s = buf; 3193 sps_limit = getdigits(&s); 3194 if (*s != NUL && !VIM_ISDIGIT(*s)) 3195 f = -1; 3196 } 3197 else if (STRCMP(buf, "best") == 0) 3198 f = SPS_BEST; 3199 else if (STRCMP(buf, "fast") == 0) 3200 f = SPS_FAST; 3201 else if (STRCMP(buf, "double") == 0) 3202 f = SPS_DOUBLE; 3203 else if (STRNCMP(buf, "expr:", 5) != 0 3204 && STRNCMP(buf, "file:", 5) != 0) 3205 f = -1; 3206 3207 if (f == -1 || (sps_flags != 0 && f != 0)) 3208 { 3209 sps_flags = SPS_BEST; 3210 sps_limit = 9999; 3211 return FAIL; 3212 } 3213 if (f != 0) 3214 sps_flags = f; 3215 } 3216 3217 if (sps_flags == 0) 3218 sps_flags = SPS_BEST; 3219 3220 return OK; 3221 } 3222 3223 /* 3224 * "z=": Find badly spelled word under or after the cursor. 3225 * Give suggestions for the properly spelled word. 3226 * In Visual mode use the highlighted word as the bad word. 3227 * When "count" is non-zero use that suggestion. 
3228 */ 3229 void 3230 spell_suggest(int count) 3231 { 3232 char_u *line; 3233 pos_T prev_cursor = curwin->w_cursor; 3234 char_u wcopy[MAXWLEN + 2]; 3235 char_u *p; 3236 int i; 3237 int c; 3238 suginfo_T sug; 3239 suggest_T *stp; 3240 int mouse_used; 3241 int need_cap; 3242 int limit; 3243 int selected = count; 3244 int badlen = 0; 3245 int msg_scroll_save = msg_scroll; 3246 3247 if (no_spell_checking(curwin)) 3248 return; 3249 3250 if (VIsual_active) 3251 { 3252 /* Use the Visually selected text as the bad word. But reject 3253 * a multi-line selection. */ 3254 if (curwin->w_cursor.lnum != VIsual.lnum) 3255 { 3256 vim_beep(BO_SPELL); 3257 return; 3258 } 3259 badlen = (int)curwin->w_cursor.col - (int)VIsual.col; 3260 if (badlen < 0) 3261 badlen = -badlen; 3262 else 3263 curwin->w_cursor.col = VIsual.col; 3264 ++badlen; 3265 end_visual_mode(); 3266 } 3267 /* Find the start of the badly spelled word. */ 3268 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0 3269 || curwin->w_cursor.col > prev_cursor.col) 3270 { 3271 /* No bad word or it starts after the cursor: use the word under the 3272 * cursor. */ 3273 curwin->w_cursor = prev_cursor; 3274 line = ml_get_curline(); 3275 p = line + curwin->w_cursor.col; 3276 /* Backup to before start of word. */ 3277 while (p > line && spell_iswordp_nmw(p, curwin)) 3278 MB_PTR_BACK(line, p); 3279 /* Forward to start of word. */ 3280 while (*p != NUL && !spell_iswordp_nmw(p, curwin)) 3281 MB_PTR_ADV(p); 3282 3283 if (!spell_iswordp_nmw(p, curwin)) /* No word found. */ 3284 { 3285 beep_flush(); 3286 return; 3287 } 3288 curwin->w_cursor.col = (colnr_T)(p - line); 3289 } 3290 3291 /* Get the word and its length. */ 3292 3293 /* Figure out if the word should be capitalised. */ 3294 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col); 3295 3296 /* Make a copy of current line since autocommands may free the line. 
*/ 3297 line = vim_strsave(ml_get_curline()); 3298 if (line == NULL) 3299 goto skip; 3300 3301 /* Get the list of suggestions. Limit to 'lines' - 2 or the number in 3302 * 'spellsuggest', whatever is smaller. */ 3303 if (sps_limit > (int)Rows - 2) 3304 limit = (int)Rows - 2; 3305 else 3306 limit = sps_limit; 3307 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit, 3308 TRUE, need_cap, TRUE); 3309 3310 if (sug.su_ga.ga_len == 0) 3311 msg(_("Sorry, no suggestions")); 3312 else if (count > 0) 3313 { 3314 if (count > sug.su_ga.ga_len) 3315 smsg(_("Sorry, only %ld suggestions"), 3316 (long)sug.su_ga.ga_len); 3317 } 3318 else 3319 { 3320 VIM_CLEAR(repl_from); 3321 VIM_CLEAR(repl_to); 3322 3323 #ifdef FEAT_RIGHTLEFT 3324 /* When 'rightleft' is set the list is drawn right-left. */ 3325 cmdmsg_rl = curwin->w_p_rl; 3326 if (cmdmsg_rl) 3327 msg_col = Columns - 1; 3328 #endif 3329 3330 /* List the suggestions. */ 3331 msg_start(); 3332 msg_row = Rows - 1; /* for when 'cmdheight' > 1 */ 3333 lines_left = Rows; /* avoid more prompt */ 3334 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"), 3335 sug.su_badlen, sug.su_badptr); 3336 #ifdef FEAT_RIGHTLEFT 3337 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0) 3338 { 3339 /* And now the rabbit from the high hat: Avoid showing the 3340 * untranslated message rightleft. */ 3341 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC", 3342 sug.su_badlen, sug.su_badptr); 3343 } 3344 #endif 3345 msg_puts((char *)IObuff); 3346 msg_clr_eos(); 3347 msg_putchar('\n'); 3348 3349 msg_scroll = TRUE; 3350 for (i = 0; i < sug.su_ga.ga_len; ++i) 3351 { 3352 stp = &SUG(sug.su_ga, i); 3353 3354 /* The suggested word may replace only part of the bad word, add 3355 * the not replaced part. 
*/ 3356 vim_strncpy(wcopy, stp->st_word, MAXWLEN); 3357 if (sug.su_badlen > stp->st_orglen) 3358 vim_strncpy(wcopy + stp->st_wordlen, 3359 sug.su_badptr + stp->st_orglen, 3360 sug.su_badlen - stp->st_orglen); 3361 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); 3362 #ifdef FEAT_RIGHTLEFT 3363 if (cmdmsg_rl) 3364 rl_mirror(IObuff); 3365 #endif 3366 msg_puts((char *)IObuff); 3367 3368 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy); 3369 msg_puts((char *)IObuff); 3370 3371 /* The word may replace more than "su_badlen". */ 3372 if (sug.su_badlen < stp->st_orglen) 3373 { 3374 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""), 3375 stp->st_orglen, sug.su_badptr); 3376 msg_puts((char *)IObuff); 3377 } 3378 3379 if (p_verbose > 0) 3380 { 3381 /* Add the score. */ 3382 if (sps_flags & (SPS_DOUBLE | SPS_BEST)) 3383 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)", 3384 stp->st_salscore ? "s " : "", 3385 stp->st_score, stp->st_altscore); 3386 else 3387 vim_snprintf((char *)IObuff, IOSIZE, " (%d)", 3388 stp->st_score); 3389 #ifdef FEAT_RIGHTLEFT 3390 if (cmdmsg_rl) 3391 /* Mirror the numbers, but keep the leading space. */ 3392 rl_mirror(IObuff + 1); 3393 #endif 3394 msg_advance(30); 3395 msg_puts((char *)IObuff); 3396 } 3397 msg_putchar('\n'); 3398 } 3399 3400 #ifdef FEAT_RIGHTLEFT 3401 cmdmsg_rl = FALSE; 3402 msg_col = 0; 3403 #endif 3404 /* Ask for choice. */ 3405 selected = prompt_for_number(&mouse_used); 3406 if (mouse_used) 3407 selected -= lines_left; 3408 lines_left = Rows; /* avoid more prompt */ 3409 /* don't delay for 'smd' in normal_cmd() */ 3410 msg_scroll = msg_scroll_save; 3411 } 3412 3413 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) 3414 { 3415 /* Save the from and to text for :spellrepall. */ 3416 stp = &SUG(sug.su_ga, selected - 1); 3417 if (sug.su_badlen > stp->st_orglen) 3418 { 3419 /* Replacing less than "su_badlen", append the remainder to 3420 * repl_to. 
*/ 3421 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen); 3422 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word, 3423 sug.su_badlen - stp->st_orglen, 3424 sug.su_badptr + stp->st_orglen); 3425 repl_to = vim_strsave(IObuff); 3426 } 3427 else 3428 { 3429 /* Replacing su_badlen or more, use the whole word. */ 3430 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); 3431 repl_to = vim_strsave(stp->st_word); 3432 } 3433 3434 /* Replace the word. */ 3435 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1); 3436 if (p != NULL) 3437 { 3438 c = (int)(sug.su_badptr - line); 3439 mch_memmove(p, line, c); 3440 STRCPY(p + c, stp->st_word); 3441 STRCAT(p, sug.su_badptr + stp->st_orglen); 3442 ml_replace(curwin->w_cursor.lnum, p, FALSE); 3443 curwin->w_cursor.col = c; 3444 3445 /* For redo we use a change-word command. */ 3446 ResetRedobuff(); 3447 AppendToRedobuff((char_u *)"ciw"); 3448 AppendToRedobuffLit(p + c, 3449 stp->st_wordlen + sug.su_badlen - stp->st_orglen); 3450 AppendCharToRedobuff(ESC); 3451 3452 /* After this "p" may be invalid. */ 3453 changed_bytes(curwin->w_cursor.lnum, c); 3454 } 3455 } 3456 else 3457 curwin->w_cursor = prev_cursor; 3458 3459 spell_find_cleanup(&sug); 3460 skip: 3461 vim_free(line); 3462 } 3463 3464 /* 3465 * Check if the word at line "lnum" column "col" is required to start with a 3466 * capital. This uses 'spellcapcheck' of the current buffer. 3467 */ 3468 static int 3469 check_need_cap(linenr_T lnum, colnr_T col) 3470 { 3471 int need_cap = FALSE; 3472 char_u *line; 3473 char_u *line_copy = NULL; 3474 char_u *p; 3475 colnr_T endcol; 3476 regmatch_T regmatch; 3477 3478 if (curwin->w_s->b_cap_prog == NULL) 3479 return FALSE; 3480 3481 line = ml_get_curline(); 3482 endcol = 0; 3483 if (getwhitecols(line) >= (int)col) 3484 { 3485 /* At start of line, check if previous line is empty or sentence 3486 * ends there. 
*/ 3487 if (lnum == 1) 3488 need_cap = TRUE; 3489 else 3490 { 3491 line = ml_get(lnum - 1); 3492 if (*skipwhite(line) == NUL) 3493 need_cap = TRUE; 3494 else 3495 { 3496 /* Append a space in place of the line break. */ 3497 line_copy = concat_str(line, (char_u *)" "); 3498 line = line_copy; 3499 endcol = (colnr_T)STRLEN(line); 3500 } 3501 } 3502 } 3503 else 3504 endcol = col; 3505 3506 if (endcol > 0) 3507 { 3508 /* Check if sentence ends before the bad word. */ 3509 regmatch.regprog = curwin->w_s->b_cap_prog; 3510 regmatch.rm_ic = FALSE; 3511 p = line + endcol; 3512 for (;;) 3513 { 3514 MB_PTR_BACK(line, p); 3515 if (p == line || spell_iswordp_nmw(p, curwin)) 3516 break; 3517 if (vim_regexec(®match, p, 0) 3518 && regmatch.endp[0] == line + endcol) 3519 { 3520 need_cap = TRUE; 3521 break; 3522 } 3523 } 3524 curwin->w_s->b_cap_prog = regmatch.regprog; 3525 } 3526 3527 vim_free(line_copy); 3528 3529 return need_cap; 3530 } 3531 3532 3533 /* 3534 * ":spellrepall" 3535 */ 3536 void 3537 ex_spellrepall(exarg_T *eap UNUSED) 3538 { 3539 pos_T pos = curwin->w_cursor; 3540 char_u *frompat; 3541 int addlen; 3542 char_u *line; 3543 char_u *p; 3544 int save_ws = p_ws; 3545 linenr_T prev_lnum = 0; 3546 3547 if (repl_from == NULL || repl_to == NULL) 3548 { 3549 emsg(_("E752: No previous spell replacement")); 3550 return; 3551 } 3552 addlen = (int)(STRLEN(repl_to) - STRLEN(repl_from)); 3553 3554 frompat = alloc(STRLEN(repl_from) + 7); 3555 if (frompat == NULL) 3556 return; 3557 sprintf((char *)frompat, "\\V\\<%s\\>", repl_from); 3558 p_ws = FALSE; 3559 3560 sub_nsubs = 0; 3561 sub_nlines = 0; 3562 curwin->w_cursor.lnum = 0; 3563 while (!got_int) 3564 { 3565 if (do_search(NULL, '/', frompat, 1L, SEARCH_KEEP, NULL, NULL) == 0 3566 || u_save_cursor() == FAIL) 3567 break; 3568 3569 /* Only replace when the right word isn't there yet. This happens 3570 * when changing "etc" to "etc.". 
*/ 3571 line = ml_get_curline(); 3572 if (addlen <= 0 || STRNCMP(line + curwin->w_cursor.col, 3573 repl_to, STRLEN(repl_to)) != 0) 3574 { 3575 p = alloc(STRLEN(line) + addlen + 1); 3576 if (p == NULL) 3577 break; 3578 mch_memmove(p, line, curwin->w_cursor.col); 3579 STRCPY(p + curwin->w_cursor.col, repl_to); 3580 STRCAT(p, line + curwin->w_cursor.col + STRLEN(repl_from)); 3581 ml_replace(curwin->w_cursor.lnum, p, FALSE); 3582 changed_bytes(curwin->w_cursor.lnum, curwin->w_cursor.col); 3583 3584 if (curwin->w_cursor.lnum != prev_lnum) 3585 { 3586 ++sub_nlines; 3587 prev_lnum = curwin->w_cursor.lnum; 3588 } 3589 ++sub_nsubs; 3590 } 3591 curwin->w_cursor.col += (colnr_T)STRLEN(repl_to); 3592 } 3593 3594 p_ws = save_ws; 3595 curwin->w_cursor = pos; 3596 vim_free(frompat); 3597 3598 if (sub_nsubs == 0) 3599 semsg(_("E753: Not found: %s"), repl_from); 3600 else 3601 do_sub_msg(FALSE); 3602 } 3603 3604 /* 3605 * Find spell suggestions for "word". Return them in the growarray "*gap" as 3606 * a list of allocated strings. 3607 */ 3608 void 3609 spell_suggest_list( 3610 garray_T *gap, 3611 char_u *word, 3612 int maxcount, /* maximum nr of suggestions */ 3613 int need_cap, /* 'spellcapcheck' matched */ 3614 int interactive) 3615 { 3616 suginfo_T sug; 3617 int i; 3618 suggest_T *stp; 3619 char_u *wcopy; 3620 3621 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive); 3622 3623 /* Make room in "gap". */ 3624 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); 3625 if (ga_grow(gap, sug.su_ga.ga_len) == OK) 3626 { 3627 for (i = 0; i < sug.su_ga.ga_len; ++i) 3628 { 3629 stp = &SUG(sug.su_ga, i); 3630 3631 /* The suggested word may replace only part of "word", add the not 3632 * replaced part. 
*/ 3633 wcopy = alloc(stp->st_wordlen 3634 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1); 3635 if (wcopy == NULL) 3636 break; 3637 STRCPY(wcopy, stp->st_word); 3638 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); 3639 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; 3640 } 3641 } 3642 3643 spell_find_cleanup(&sug); 3644 } 3645 3646 /* 3647 * Find spell suggestions for the word at the start of "badptr". 3648 * Return the suggestions in "su->su_ga". 3649 * The maximum number of suggestions is "maxcount". 3650 * Note: does use info for the current window. 3651 * This is based on the mechanisms of Aspell, but completely reimplemented. 3652 */ 3653 static void 3654 spell_find_suggest( 3655 char_u *badptr, 3656 int badlen, /* length of bad word or 0 if unknown */ 3657 suginfo_T *su, 3658 int maxcount, 3659 int banbadword, /* don't include badword in suggestions */ 3660 int need_cap, /* word should start with capital */ 3661 int interactive) 3662 { 3663 hlf_T attr = HLF_COUNT; 3664 char_u buf[MAXPATHL]; 3665 char_u *p; 3666 int do_combine = FALSE; 3667 char_u *sps_copy; 3668 #ifdef FEAT_EVAL 3669 static int expr_busy = FALSE; 3670 #endif 3671 int c; 3672 int i; 3673 langp_T *lp; 3674 3675 /* 3676 * Set the info in "*su". 
3677 */ 3678 vim_memset(su, 0, sizeof(suginfo_T)); 3679 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10); 3680 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10); 3681 if (*badptr == NUL) 3682 return; 3683 hash_init(&su->su_banned); 3684 3685 su->su_badptr = badptr; 3686 if (badlen != 0) 3687 su->su_badlen = badlen; 3688 else 3689 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE); 3690 su->su_maxcount = maxcount; 3691 su->su_maxscore = SCORE_MAXINIT; 3692 3693 if (su->su_badlen >= MAXWLEN) 3694 su->su_badlen = MAXWLEN - 1; /* just in case */ 3695 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen); 3696 (void)spell_casefold(su->su_badptr, su->su_badlen, 3697 su->su_fbadword, MAXWLEN); 3698 /* TODO: make this work if the case-folded text is longer than the original 3699 * text. Currently an illegal byte causes wrong pointer computations. */ 3700 su->su_fbadword[su->su_badlen] = NUL; 3701 3702 /* get caps flags for bad word */ 3703 su->su_badflags = badword_captype(su->su_badptr, 3704 su->su_badptr + su->su_badlen); 3705 if (need_cap) 3706 su->su_badflags |= WF_ONECAP; 3707 3708 /* Find the default language for sound folding. We simply use the first 3709 * one in 'spelllang' that supports sound folding. That's good for when 3710 * using multiple files for one language, it's not that bad when mixing 3711 * languages (e.g., "pl,en"). */ 3712 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i) 3713 { 3714 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i); 3715 if (lp->lp_sallang != NULL) 3716 { 3717 su->su_sallang = lp->lp_sallang; 3718 break; 3719 } 3720 } 3721 3722 /* Soundfold the bad word with the default sound folding, so that we don't 3723 * have to do this many times. */ 3724 if (su->su_sallang != NULL) 3725 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE, 3726 su->su_sal_badword); 3727 3728 /* If the word is not capitalised and spell_check() doesn't consider the 3729 * word to be bad then it might need to be capitalised. 
Add a suggestion 3730 * for that. */ 3731 c = PTR2CHAR(su->su_badptr); 3732 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) 3733 { 3734 make_case_word(su->su_badword, buf, WF_ONECAP); 3735 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, 3736 0, TRUE, su->su_sallang, FALSE); 3737 } 3738 3739 /* Ban the bad word itself. It may appear in another region. */ 3740 if (banbadword) 3741 add_banned(su, su->su_badword); 3742 3743 /* Make a copy of 'spellsuggest', because the expression may change it. */ 3744 sps_copy = vim_strsave(p_sps); 3745 if (sps_copy == NULL) 3746 return; 3747 3748 /* Loop over the items in 'spellsuggest'. */ 3749 for (p = sps_copy; *p != NUL; ) 3750 { 3751 copy_option_part(&p, buf, MAXPATHL, ","); 3752 3753 if (STRNCMP(buf, "expr:", 5) == 0) 3754 { 3755 #ifdef FEAT_EVAL 3756 /* Evaluate an expression. Skip this when called recursively, 3757 * when using spellsuggest() in the expression. */ 3758 if (!expr_busy) 3759 { 3760 expr_busy = TRUE; 3761 spell_suggest_expr(su, buf + 5); 3762 expr_busy = FALSE; 3763 } 3764 #endif 3765 } 3766 else if (STRNCMP(buf, "file:", 5) == 0) 3767 /* Use list of suggestions in a file. */ 3768 spell_suggest_file(su, buf + 5); 3769 else 3770 { 3771 /* Use internal method. */ 3772 spell_suggest_intern(su, interactive); 3773 if (sps_flags & SPS_DOUBLE) 3774 do_combine = TRUE; 3775 } 3776 } 3777 3778 vim_free(sps_copy); 3779 3780 if (do_combine) 3781 /* Combine the two list of suggestions. This must be done last, 3782 * because sorting changes the order again. */ 3783 score_combine(su); 3784 } 3785 3786 #ifdef FEAT_EVAL 3787 /* 3788 * Find suggestions by evaluating expression "expr". 3789 */ 3790 static void 3791 spell_suggest_expr(suginfo_T *su, char_u *expr) 3792 { 3793 list_T *list; 3794 listitem_T *li; 3795 int score; 3796 char_u *p; 3797 3798 /* The work is split up in a few parts to avoid having to export 3799 * suginfo_T. 3800 * First evaluate the expression and get the resulting list. 
*/ 3801 list = eval_spell_expr(su->su_badword, expr); 3802 if (list != NULL) 3803 { 3804 /* Loop over the items in the list. */ 3805 for (li = list->lv_first; li != NULL; li = li->li_next) 3806 if (li->li_tv.v_type == VAR_LIST) 3807 { 3808 /* Get the word and the score from the items. */ 3809 score = get_spellword(li->li_tv.vval.v_list, &p); 3810 if (score >= 0 && score <= su->su_maxscore) 3811 add_suggestion(su, &su->su_ga, p, su->su_badlen, 3812 score, 0, TRUE, su->su_sallang, FALSE); 3813 } 3814 list_unref(list); 3815 } 3816 3817 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3818 check_suggestions(su, &su->su_ga); 3819 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3820 } 3821 #endif 3822 3823 /* 3824 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'. 3825 */ 3826 static void 3827 spell_suggest_file(suginfo_T *su, char_u *fname) 3828 { 3829 FILE *fd; 3830 char_u line[MAXWLEN * 2]; 3831 char_u *p; 3832 int len; 3833 char_u cword[MAXWLEN]; 3834 3835 /* Open the file. */ 3836 fd = mch_fopen((char *)fname, "r"); 3837 if (fd == NULL) 3838 { 3839 semsg(_(e_notopen), fname); 3840 return; 3841 } 3842 3843 /* Read it line by line. */ 3844 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) 3845 { 3846 line_breakcheck(); 3847 3848 p = vim_strchr(line, '/'); 3849 if (p == NULL) 3850 continue; /* No Tab found, just skip the line. */ 3851 *p++ = NUL; 3852 if (STRICMP(su->su_badword, line) == 0) 3853 { 3854 /* Match! Isolate the good word, until CR or NL. */ 3855 for (len = 0; p[len] >= ' '; ++len) 3856 ; 3857 p[len] = NUL; 3858 3859 /* If the suggestion doesn't have specific case duplicate the case 3860 * of the bad word. 
*/ 3861 if (captype(p, NULL) == 0) 3862 { 3863 make_case_word(p, cword, su->su_badflags); 3864 p = cword; 3865 } 3866 3867 add_suggestion(su, &su->su_ga, p, su->su_badlen, 3868 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE); 3869 } 3870 } 3871 3872 fclose(fd); 3873 3874 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3875 check_suggestions(su, &su->su_ga); 3876 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3877 } 3878 3879 /* 3880 * Find suggestions for the internal method indicated by "sps_flags". 3881 */ 3882 static void 3883 spell_suggest_intern(suginfo_T *su, int interactive) 3884 { 3885 /* 3886 * Load the .sug file(s) that are available and not done yet. 3887 */ 3888 suggest_load_files(); 3889 3890 /* 3891 * 1. Try special cases, such as repeating a word: "the the" -> "the". 3892 * 3893 * Set a maximum score to limit the combination of operations that is 3894 * tried. 3895 */ 3896 suggest_try_special(su); 3897 3898 /* 3899 * 2. Try inserting/deleting/swapping/changing a letter, use REP entries 3900 * from the .aff file and inserting a space (split the word). 3901 */ 3902 suggest_try_change(su); 3903 3904 /* For the resulting top-scorers compute the sound-a-like score. */ 3905 if (sps_flags & SPS_DOUBLE) 3906 score_comp_sal(su); 3907 3908 /* 3909 * 3. Try finding sound-a-like words. 3910 */ 3911 if ((sps_flags & SPS_FAST) == 0) 3912 { 3913 if (sps_flags & SPS_BEST) 3914 /* Adjust the word score for the suggestions found so far for how 3915 * they sounds like. */ 3916 rescore_suggestions(su); 3917 3918 /* 3919 * While going through the soundfold tree "su_maxscore" is the score 3920 * for the soundfold word, limits the changes that are being tried, 3921 * and "su_sfmaxscore" the rescored score, which is set by 3922 * cleanup_suggestions(). 3923 * First find words with a small edit distance, because this is much 3924 * faster and often already finds the top-N suggestions. 
If we didn't 3925 * find many suggestions try again with a higher edit distance. 3926 * "sl_sounddone" is used to avoid doing the same word twice. 3927 */ 3928 suggest_try_soundalike_prep(); 3929 su->su_maxscore = SCORE_SFMAX1; 3930 su->su_sfmaxscore = SCORE_MAXINIT * 3; 3931 suggest_try_soundalike(su); 3932 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) 3933 { 3934 /* We didn't find enough matches, try again, allowing more 3935 * changes to the soundfold word. */ 3936 su->su_maxscore = SCORE_SFMAX2; 3937 suggest_try_soundalike(su); 3938 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) 3939 { 3940 /* Still didn't find enough matches, try again, allowing even 3941 * more changes to the soundfold word. */ 3942 su->su_maxscore = SCORE_SFMAX3; 3943 suggest_try_soundalike(su); 3944 } 3945 } 3946 su->su_maxscore = su->su_sfmaxscore; 3947 suggest_try_soundalike_finish(); 3948 } 3949 3950 /* When CTRL-C was hit while searching do show the results. Only clear 3951 * got_int when using a command, not for spellsuggest(). */ 3952 ui_breakcheck(); 3953 if (interactive && got_int) 3954 { 3955 (void)vgetc(); 3956 got_int = FALSE; 3957 } 3958 3959 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) 3960 { 3961 if (sps_flags & SPS_BEST) 3962 /* Adjust the word score for how it sounds like. */ 3963 rescore_suggestions(su); 3964 3965 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3966 check_suggestions(su, &su->su_ga); 3967 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3968 } 3969 } 3970 3971 /* 3972 * Free the info put in "*su" by spell_find_suggest(). 3973 */ 3974 static void 3975 spell_find_cleanup(suginfo_T *su) 3976 { 3977 int i; 3978 3979 /* Free the suggestions. */ 3980 for (i = 0; i < su->su_ga.ga_len; ++i) 3981 vim_free(SUG(su->su_ga, i).st_word); 3982 ga_clear(&su->su_ga); 3983 for (i = 0; i < su->su_sga.ga_len; ++i) 3984 vim_free(SUG(su->su_sga, i).st_word); 3985 ga_clear(&su->su_sga); 3986 3987 /* Free the banned words. 
*/ 3988 hash_clear_all(&su->su_banned, 0); 3989 } 3990 3991 /* 3992 * Make a copy of "word", with the first letter upper or lower cased, to 3993 * "wcopy[MAXWLEN]". "word" must not be empty. 3994 * The result is NUL terminated. 3995 */ 3996 void 3997 onecap_copy( 3998 char_u *word, 3999 char_u *wcopy, 4000 int upper) /* TRUE: first letter made upper case */ 4001 { 4002 char_u *p; 4003 int c; 4004 int l; 4005 4006 p = word; 4007 if (has_mbyte) 4008 c = mb_cptr2char_adv(&p); 4009 else 4010 c = *p++; 4011 if (upper) 4012 c = SPELL_TOUPPER(c); 4013 else 4014 c = SPELL_TOFOLD(c); 4015 if (has_mbyte) 4016 l = mb_char2bytes(c, wcopy); 4017 else 4018 { 4019 l = 1; 4020 wcopy[0] = c; 4021 } 4022 vim_strncpy(wcopy + l, p, MAXWLEN - l - 1); 4023 } 4024 4025 /* 4026 * Make a copy of "word" with all the letters upper cased into 4027 * "wcopy[MAXWLEN]". The result is NUL terminated. 4028 */ 4029 static void 4030 allcap_copy(char_u *word, char_u *wcopy) 4031 { 4032 char_u *s; 4033 char_u *d; 4034 int c; 4035 4036 d = wcopy; 4037 for (s = word; *s != NUL; ) 4038 { 4039 if (has_mbyte) 4040 c = mb_cptr2char_adv(&s); 4041 else 4042 c = *s++; 4043 4044 /* We only change 0xdf to SS when we are certain latin1 is used. It 4045 * would cause weird errors in other 8-bit encodings. */ 4046 if (enc_latin1like && c == 0xdf) 4047 { 4048 c = 'S'; 4049 if (d - wcopy >= MAXWLEN - 1) 4050 break; 4051 *d++ = c; 4052 } 4053 else 4054 c = SPELL_TOUPPER(c); 4055 4056 if (has_mbyte) 4057 { 4058 if (d - wcopy >= MAXWLEN - MB_MAXBYTES) 4059 break; 4060 d += mb_char2bytes(c, d); 4061 } 4062 else 4063 { 4064 if (d - wcopy >= MAXWLEN - 1) 4065 break; 4066 *d++ = c; 4067 } 4068 } 4069 *d = NUL; 4070 } 4071 4072 /* 4073 * Try finding suggestions by recognizing specific situations. 4074 */ 4075 static void 4076 suggest_try_special(suginfo_T *su) 4077 { 4078 char_u *p; 4079 size_t len; 4080 int c; 4081 char_u word[MAXWLEN]; 4082 4083 /* 4084 * Recognize a word that is repeated: "the the". 
4085 */ 4086 p = skiptowhite(su->su_fbadword); 4087 len = p - su->su_fbadword; 4088 p = skipwhite(p); 4089 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0) 4090 { 4091 /* Include badflags: if the badword is onecap or allcap 4092 * use that for the goodword too: "The the" -> "The". */ 4093 c = su->su_fbadword[len]; 4094 su->su_fbadword[len] = NUL; 4095 make_case_word(su->su_fbadword, word, su->su_badflags); 4096 su->su_fbadword[len] = c; 4097 4098 /* Give a soundalike score of 0, compute the score as if deleting one 4099 * character. */ 4100 add_suggestion(su, &su->su_ga, word, su->su_badlen, 4101 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE); 4102 } 4103 } 4104 4105 /* 4106 * Change the 0 to 1 to measure how much time is spent in each state. 4107 * Output is dumped in "suggestprof". 4108 */ 4109 #if 0 4110 # define SUGGEST_PROFILE 4111 proftime_T current; 4112 proftime_T total; 4113 proftime_T times[STATE_FINAL + 1]; 4114 long counts[STATE_FINAL + 1]; 4115 4116 static void 4117 prof_init(void) 4118 { 4119 for (int i = 0; i <= STATE_FINAL; ++i) 4120 { 4121 profile_zero(×[i]); 4122 counts[i] = 0; 4123 } 4124 profile_start(¤t); 4125 profile_start(&total); 4126 } 4127 4128 /* call before changing state */ 4129 static void 4130 prof_store(state_T state) 4131 { 4132 profile_end(¤t); 4133 profile_add(×[state], ¤t); 4134 ++counts[state]; 4135 profile_start(¤t); 4136 } 4137 # define PROF_STORE(state) prof_store(state); 4138 4139 static void 4140 prof_report(char *name) 4141 { 4142 FILE *fd = fopen("suggestprof", "a"); 4143 4144 profile_end(&total); 4145 fprintf(fd, "-----------------------\n"); 4146 fprintf(fd, "%s: %s\n", name, profile_msg(&total)); 4147 for (int i = 0; i <= STATE_FINAL; ++i) 4148 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(×[i]), counts[i]); 4149 fclose(fd); 4150 } 4151 #else 4152 # define PROF_STORE(state) 4153 #endif 4154 4155 /* 4156 * Try finding suggestions by adding/removing/swapping letters. 
4157 */ 4158 static void 4159 suggest_try_change(suginfo_T *su) 4160 { 4161 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */ 4162 int n; 4163 char_u *p; 4164 int lpi; 4165 langp_T *lp; 4166 4167 /* We make a copy of the case-folded bad word, so that we can modify it 4168 * to find matches (esp. REP items). Append some more text, changing 4169 * chars after the bad word may help. */ 4170 STRCPY(fword, su->su_fbadword); 4171 n = (int)STRLEN(fword); 4172 p = su->su_badptr + su->su_badlen; 4173 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n); 4174 4175 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 4176 { 4177 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 4178 4179 /* If reloading a spell file fails it's still in the list but 4180 * everything has been cleared. */ 4181 if (lp->lp_slang->sl_fbyts == NULL) 4182 continue; 4183 4184 /* Try it for this language. Will add possible suggestions. */ 4185 #ifdef SUGGEST_PROFILE 4186 prof_init(); 4187 #endif 4188 suggest_trie_walk(su, lp, fword, FALSE); 4189 #ifdef SUGGEST_PROFILE 4190 prof_report("try_change"); 4191 #endif 4192 } 4193 } 4194 4195 /* Check the maximum score, if we go over it we won't try this change. */ 4196 #define TRY_DEEPER(su, stack, depth, add) \ 4197 (stack[depth].ts_score + (add) < su->su_maxscore) 4198 4199 /* 4200 * Try finding suggestions by adding/removing/swapping letters. 4201 * 4202 * This uses a state machine. At each node in the tree we try various 4203 * operations. When trying if an operation works "depth" is increased and the 4204 * stack[] is used to store info. This allows combinations, thus insert one 4205 * character, replace one and delete another. The number of changes is 4206 * limited by su->su_maxscore. 4207 * 4208 * After implementing this I noticed an article by Kemal Oflazer that 4209 * describes something similar: "Error-tolerant Finite State Recognition with 4210 * Applications to Morphological Analysis and Spelling Correction" (1996). 
4211 * The implementation in the article is simplified and requires a stack of 4212 * unknown depth. The implementation here only needs a stack depth equal to 4213 * the length of the word. 4214 * 4215 * This is also used for the sound-folded word, "soundfold" is TRUE then. 4216 * The mechanism is the same, but we find a match with a sound-folded word 4217 * that comes from one or more original words. Each of these words may be 4218 * added, this is done by add_sound_suggest(). 4219 * Don't use: 4220 * the prefix tree or the keep-case tree 4221 * "su->su_badlen" 4222 * anything to do with upper and lower case 4223 * anything to do with word or non-word characters ("spell_iswordp()") 4224 * banned words 4225 * word flags (rare, region, compounding) 4226 * word splitting for now 4227 * "similar_chars()" 4228 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" 4229 */ 4230 static void 4231 suggest_trie_walk( 4232 suginfo_T *su, 4233 langp_T *lp, 4234 char_u *fword, 4235 int soundfold) 4236 { 4237 char_u tword[MAXWLEN]; /* good word collected so far */ 4238 trystate_T stack[MAXWLEN]; 4239 char_u preword[MAXWLEN * 3]; /* word found with proper case; 4240 * concatenation of prefix compound 4241 * words and split word. NUL terminated 4242 * when going deeper but not when coming 4243 * back. */ 4244 char_u compflags[MAXWLEN]; /* compound flags, one for each word */ 4245 trystate_T *sp; 4246 int newscore; 4247 int score; 4248 char_u *byts, *fbyts, *pbyts; 4249 idx_T *idxs, *fidxs, *pidxs; 4250 int depth; 4251 int c, c2, c3; 4252 int n = 0; 4253 int flags; 4254 garray_T *gap; 4255 idx_T arridx; 4256 int len; 4257 char_u *p; 4258 fromto_T *ftp; 4259 int fl = 0, tl; 4260 int repextra = 0; /* extra bytes in fword[] from REP item */ 4261 slang_T *slang = lp->lp_slang; 4262 int fword_ends; 4263 int goodword_ends; 4264 #ifdef DEBUG_TRIEWALK 4265 /* Stores the name of the change made at each level. 
*/ 4266 char_u changename[MAXWLEN][80]; 4267 #endif 4268 int breakcheckcount = 1000; 4269 int compound_ok; 4270 4271 /* 4272 * Go through the whole case-fold tree, try changes at each node. 4273 * "tword[]" contains the word collected from nodes in the tree. 4274 * "fword[]" the word we are trying to match with (initially the bad 4275 * word). 4276 */ 4277 depth = 0; 4278 sp = &stack[0]; 4279 vim_memset(sp, 0, sizeof(trystate_T)); 4280 sp->ts_curi = 1; 4281 4282 if (soundfold) 4283 { 4284 /* Going through the soundfold tree. */ 4285 byts = fbyts = slang->sl_sbyts; 4286 idxs = fidxs = slang->sl_sidxs; 4287 pbyts = NULL; 4288 pidxs = NULL; 4289 sp->ts_prefixdepth = PFD_NOPREFIX; 4290 sp->ts_state = STATE_START; 4291 } 4292 else 4293 { 4294 /* 4295 * When there are postponed prefixes we need to use these first. At 4296 * the end of the prefix we continue in the case-fold tree. 4297 */ 4298 fbyts = slang->sl_fbyts; 4299 fidxs = slang->sl_fidxs; 4300 pbyts = slang->sl_pbyts; 4301 pidxs = slang->sl_pidxs; 4302 if (pbyts != NULL) 4303 { 4304 byts = pbyts; 4305 idxs = pidxs; 4306 sp->ts_prefixdepth = PFD_PREFIXTREE; 4307 sp->ts_state = STATE_NOPREFIX; /* try without prefix first */ 4308 } 4309 else 4310 { 4311 byts = fbyts; 4312 idxs = fidxs; 4313 sp->ts_prefixdepth = PFD_NOPREFIX; 4314 sp->ts_state = STATE_START; 4315 } 4316 } 4317 4318 /* 4319 * Loop to find all suggestions. At each round we either: 4320 * - For the current state try one operation, advance "ts_curi", 4321 * increase "depth". 4322 * - When a state is done go to the next, set "ts_state". 4323 * - When all states are tried decrease "depth". 4324 */ 4325 while (depth >= 0 && !got_int) 4326 { 4327 sp = &stack[depth]; 4328 switch (sp->ts_state) 4329 { 4330 case STATE_START: 4331 case STATE_NOPREFIX: 4332 /* 4333 * Start of node: Deal with NUL bytes, which means 4334 * tword[] may end here. 
4335 */ 4336 arridx = sp->ts_arridx; /* current node in the tree */ 4337 len = byts[arridx]; /* bytes in this node */ 4338 arridx += sp->ts_curi; /* index of current byte */ 4339 4340 if (sp->ts_prefixdepth == PFD_PREFIXTREE) 4341 { 4342 /* Skip over the NUL bytes, we use them later. */ 4343 for (n = 0; n < len && byts[arridx + n] == 0; ++n) 4344 ; 4345 sp->ts_curi += n; 4346 4347 /* Always past NUL bytes now. */ 4348 n = (int)sp->ts_state; 4349 PROF_STORE(sp->ts_state) 4350 sp->ts_state = STATE_ENDNUL; 4351 sp->ts_save_badflags = su->su_badflags; 4352 4353 /* At end of a prefix or at start of prefixtree: check for 4354 * following word. */ 4355 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) 4356 { 4357 /* Set su->su_badflags to the caps type at this position. 4358 * Use the caps type until here for the prefix itself. */ 4359 if (has_mbyte) 4360 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 4361 else 4362 n = sp->ts_fidx; 4363 flags = badword_captype(su->su_badptr, su->su_badptr + n); 4364 su->su_badflags = badword_captype(su->su_badptr + n, 4365 su->su_badptr + su->su_badlen); 4366 #ifdef DEBUG_TRIEWALK 4367 sprintf(changename[depth], "prefix"); 4368 #endif 4369 go_deeper(stack, depth, 0); 4370 ++depth; 4371 sp = &stack[depth]; 4372 sp->ts_prefixdepth = depth - 1; 4373 byts = fbyts; 4374 idxs = fidxs; 4375 sp->ts_arridx = 0; 4376 4377 /* Move the prefix to preword[] with the right case 4378 * and make find_keepcap_word() works. */ 4379 tword[sp->ts_twordlen] = NUL; 4380 make_case_word(tword + sp->ts_splitoff, 4381 preword + sp->ts_prewordlen, flags); 4382 sp->ts_prewordlen = (char_u)STRLEN(preword); 4383 sp->ts_splitoff = sp->ts_twordlen; 4384 } 4385 break; 4386 } 4387 4388 if (sp->ts_curi > len || byts[arridx] != 0) 4389 { 4390 /* Past bytes in node and/or past NUL bytes. */ 4391 PROF_STORE(sp->ts_state) 4392 sp->ts_state = STATE_ENDNUL; 4393 sp->ts_save_badflags = su->su_badflags; 4394 break; 4395 } 4396 4397 /* 4398 * End of word in tree. 
4399 */ 4400 ++sp->ts_curi; /* eat one NUL byte */ 4401 4402 flags = (int)idxs[arridx]; 4403 4404 /* Skip words with the NOSUGGEST flag. */ 4405 if (flags & WF_NOSUGGEST) 4406 break; 4407 4408 fword_ends = (fword[sp->ts_fidx] == NUL 4409 || (soundfold 4410 ? VIM_ISWHITE(fword[sp->ts_fidx]) 4411 : !spell_iswordp(fword + sp->ts_fidx, curwin))); 4412 tword[sp->ts_twordlen] = NUL; 4413 4414 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL 4415 && (sp->ts_flags & TSF_PREFIXOK) == 0) 4416 { 4417 /* There was a prefix before the word. Check that the prefix 4418 * can be used with this word. */ 4419 /* Count the length of the NULs in the prefix. If there are 4420 * none this must be the first try without a prefix. */ 4421 n = stack[sp->ts_prefixdepth].ts_arridx; 4422 len = pbyts[n++]; 4423 for (c = 0; c < len && pbyts[n + c] == 0; ++c) 4424 ; 4425 if (c > 0) 4426 { 4427 c = valid_word_prefix(c, n, flags, 4428 tword + sp->ts_splitoff, slang, FALSE); 4429 if (c == 0) 4430 break; 4431 4432 /* Use the WF_RARE flag for a rare prefix. */ 4433 if (c & WF_RAREPFX) 4434 flags |= WF_RARE; 4435 4436 /* Tricky: when checking for both prefix and compounding 4437 * we run into the prefix flag first. 4438 * Remember that it's OK, so that we accept the prefix 4439 * when arriving at a compound flag. */ 4440 sp->ts_flags |= TSF_PREFIXOK; 4441 } 4442 } 4443 4444 /* Check NEEDCOMPOUND: can't use word without compounding. Do try 4445 * appending another compound word below. */ 4446 if (sp->ts_complen == sp->ts_compsplit && fword_ends 4447 && (flags & WF_NEEDCOMP)) 4448 goodword_ends = FALSE; 4449 else 4450 goodword_ends = TRUE; 4451 4452 p = NULL; 4453 compound_ok = TRUE; 4454 if (sp->ts_complen > sp->ts_compsplit) 4455 { 4456 if (slang->sl_nobreak) 4457 { 4458 /* There was a word before this word. When there was no 4459 * change in this word (it was correct) add the first word 4460 * as a suggestion. If this word was corrected too, we 4461 * need to check if a correct word follows. 
*/ 4462 if (sp->ts_fidx - sp->ts_splitfidx 4463 == sp->ts_twordlen - sp->ts_splitoff 4464 && STRNCMP(fword + sp->ts_splitfidx, 4465 tword + sp->ts_splitoff, 4466 sp->ts_fidx - sp->ts_splitfidx) == 0) 4467 { 4468 preword[sp->ts_prewordlen] = NUL; 4469 newscore = score_wordcount_adj(slang, sp->ts_score, 4470 preword + sp->ts_prewordlen, 4471 sp->ts_prewordlen > 0); 4472 /* Add the suggestion if the score isn't too bad. */ 4473 if (newscore <= su->su_maxscore) 4474 add_suggestion(su, &su->su_ga, preword, 4475 sp->ts_splitfidx - repextra, 4476 newscore, 0, FALSE, 4477 lp->lp_sallang, FALSE); 4478 break; 4479 } 4480 } 4481 else 4482 { 4483 /* There was a compound word before this word. If this 4484 * word does not support compounding then give up 4485 * (splitting is tried for the word without compound 4486 * flag). */ 4487 if (((unsigned)flags >> 24) == 0 4488 || sp->ts_twordlen - sp->ts_splitoff 4489 < slang->sl_compminlen) 4490 break; 4491 /* For multi-byte chars check character length against 4492 * COMPOUNDMIN. */ 4493 if (has_mbyte 4494 && slang->sl_compminlen > 0 4495 && mb_charlen(tword + sp->ts_splitoff) 4496 < slang->sl_compminlen) 4497 break; 4498 4499 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 4500 compflags[sp->ts_complen + 1] = NUL; 4501 vim_strncpy(preword + sp->ts_prewordlen, 4502 tword + sp->ts_splitoff, 4503 sp->ts_twordlen - sp->ts_splitoff); 4504 4505 /* Verify CHECKCOMPOUNDPATTERN rules. */ 4506 if (match_checkcompoundpattern(preword, sp->ts_prewordlen, 4507 &slang->sl_comppat)) 4508 compound_ok = FALSE; 4509 4510 if (compound_ok) 4511 { 4512 p = preword; 4513 while (*skiptowhite(p) != NUL) 4514 p = skipwhite(skiptowhite(p)); 4515 if (fword_ends && !can_compound(slang, p, 4516 compflags + sp->ts_compsplit)) 4517 /* Compound is not allowed. But it may still be 4518 * possible if we add another (short) word. */ 4519 compound_ok = FALSE; 4520 } 4521 4522 /* Get pointer to last char of previous word. 
*/ 4523 p = preword + sp->ts_prewordlen; 4524 MB_PTR_BACK(preword, p); 4525 } 4526 } 4527 4528 /* 4529 * Form the word with proper case in preword. 4530 * If there is a word from a previous split, append. 4531 * For the soundfold tree don't change the case, simply append. 4532 */ 4533 if (soundfold) 4534 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); 4535 else if (flags & WF_KEEPCAP) 4536 /* Must find the word in the keep-case tree. */ 4537 find_keepcap_word(slang, tword + sp->ts_splitoff, 4538 preword + sp->ts_prewordlen); 4539 else 4540 { 4541 /* Include badflags: If the badword is onecap or allcap 4542 * use that for the goodword too. But if the badword is 4543 * allcap and it's only one char long use onecap. */ 4544 c = su->su_badflags; 4545 if ((c & WF_ALLCAP) 4546 && su->su_badlen == (*mb_ptr2len)(su->su_badptr)) 4547 c = WF_ONECAP; 4548 c |= flags; 4549 4550 /* When appending a compound word after a word character don't 4551 * use Onecap. */ 4552 if (p != NULL && spell_iswordp_nmw(p, curwin)) 4553 c &= ~WF_ONECAP; 4554 make_case_word(tword + sp->ts_splitoff, 4555 preword + sp->ts_prewordlen, c); 4556 } 4557 4558 if (!soundfold) 4559 { 4560 /* Don't use a banned word. It may appear again as a good 4561 * word, thus remember it. 
*/ 4562 if (flags & WF_BANNED) 4563 { 4564 add_banned(su, preword + sp->ts_prewordlen); 4565 break; 4566 } 4567 if ((sp->ts_complen == sp->ts_compsplit 4568 && WAS_BANNED(su, preword + sp->ts_prewordlen)) 4569 || WAS_BANNED(su, preword)) 4570 { 4571 if (slang->sl_compprog == NULL) 4572 break; 4573 /* the word so far was banned but we may try compounding */ 4574 goodword_ends = FALSE; 4575 } 4576 } 4577 4578 newscore = 0; 4579 if (!soundfold) /* soundfold words don't have flags */ 4580 { 4581 if ((flags & WF_REGION) 4582 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 4583 newscore += SCORE_REGION; 4584 if (flags & WF_RARE) 4585 newscore += SCORE_RARE; 4586 4587 if (!spell_valid_case(su->su_badflags, 4588 captype(preword + sp->ts_prewordlen, NULL))) 4589 newscore += SCORE_ICASE; 4590 } 4591 4592 /* TODO: how about splitting in the soundfold tree? */ 4593 if (fword_ends 4594 && goodword_ends 4595 && sp->ts_fidx >= sp->ts_fidxtry 4596 && compound_ok) 4597 { 4598 /* The badword also ends: add suggestions. */ 4599 #ifdef DEBUG_TRIEWALK 4600 if (soundfold && STRCMP(preword, "smwrd") == 0) 4601 { 4602 int j; 4603 4604 /* print the stack of changes that brought us here */ 4605 smsg("------ %s -------", fword); 4606 for (j = 0; j < depth; ++j) 4607 smsg("%s", changename[j]); 4608 } 4609 #endif 4610 if (soundfold) 4611 { 4612 /* For soundfolded words we need to find the original 4613 * words, the edit distance and then add them. */ 4614 add_sound_suggest(su, preword, sp->ts_score, lp); 4615 } 4616 else if (sp->ts_fidx > 0) 4617 { 4618 /* Give a penalty when changing non-word char to word 4619 * char, e.g., "thes," -> "these". */ 4620 p = fword + sp->ts_fidx; 4621 MB_PTR_BACK(fword, p); 4622 if (!spell_iswordp(p, curwin)) 4623 { 4624 p = preword + STRLEN(preword); 4625 MB_PTR_BACK(preword, p); 4626 if (spell_iswordp(p, curwin)) 4627 newscore += SCORE_NONWORD; 4628 } 4629 4630 /* Give a bonus to words seen before. 
*/ 4631 score = score_wordcount_adj(slang, 4632 sp->ts_score + newscore, 4633 preword + sp->ts_prewordlen, 4634 sp->ts_prewordlen > 0); 4635 4636 /* Add the suggestion if the score isn't too bad. */ 4637 if (score <= su->su_maxscore) 4638 { 4639 add_suggestion(su, &su->su_ga, preword, 4640 sp->ts_fidx - repextra, 4641 score, 0, FALSE, lp->lp_sallang, FALSE); 4642 4643 if (su->su_badflags & WF_MIXCAP) 4644 { 4645 /* We really don't know if the word should be 4646 * upper or lower case, add both. */ 4647 c = captype(preword, NULL); 4648 if (c == 0 || c == WF_ALLCAP) 4649 { 4650 make_case_word(tword + sp->ts_splitoff, 4651 preword + sp->ts_prewordlen, 4652 c == 0 ? WF_ALLCAP : 0); 4653 4654 add_suggestion(su, &su->su_ga, preword, 4655 sp->ts_fidx - repextra, 4656 score + SCORE_ICASE, 0, FALSE, 4657 lp->lp_sallang, FALSE); 4658 } 4659 } 4660 } 4661 } 4662 } 4663 4664 /* 4665 * Try word split and/or compounding. 4666 */ 4667 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) 4668 /* Don't split halfway a character. */ 4669 && (!has_mbyte || sp->ts_tcharlen == 0)) 4670 { 4671 int try_compound; 4672 int try_split; 4673 4674 /* If past the end of the bad word don't try a split. 4675 * Otherwise try changing the next word. E.g., find 4676 * suggestions for "the the" where the second "the" is 4677 * different. It's done like a split. 4678 * TODO: word split for soundfold words */ 4679 try_split = (sp->ts_fidx - repextra < su->su_badlen) 4680 && !soundfold; 4681 4682 /* Get here in several situations: 4683 * 1. The word in the tree ends: 4684 * If the word allows compounding try that. Otherwise try 4685 * a split by inserting a space. For both check that a 4686 * valid words starts at fword[sp->ts_fidx]. 4687 * For NOBREAK do like compounding to be able to check if 4688 * the next word is valid. 4689 * 2. The badword does end, but it was due to a change (e.g., 4690 * a swap). No need to split, but do check that the 4691 * following word is valid. 4692 * 3. 
The badword and the word in the tree end. It may still 4693 * be possible to compound another (short) word. 4694 */ 4695 try_compound = FALSE; 4696 if (!soundfold 4697 && !slang->sl_nocompoundsugs 4698 && slang->sl_compprog != NULL 4699 && ((unsigned)flags >> 24) != 0 4700 && sp->ts_twordlen - sp->ts_splitoff 4701 >= slang->sl_compminlen 4702 && (!has_mbyte 4703 || slang->sl_compminlen == 0 4704 || mb_charlen(tword + sp->ts_splitoff) 4705 >= slang->sl_compminlen) 4706 && (slang->sl_compsylmax < MAXWLEN 4707 || sp->ts_complen + 1 - sp->ts_compsplit 4708 < slang->sl_compmax) 4709 && (can_be_compound(sp, slang, 4710 compflags, ((unsigned)flags >> 24)))) 4711 4712 { 4713 try_compound = TRUE; 4714 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 4715 compflags[sp->ts_complen + 1] = NUL; 4716 } 4717 4718 /* For NOBREAK we never try splitting, it won't make any word 4719 * valid. */ 4720 if (slang->sl_nobreak && !slang->sl_nocompoundsugs) 4721 try_compound = TRUE; 4722 4723 /* If we could add a compound word, and it's also possible to 4724 * split at this point, do the split first and set 4725 * TSF_DIDSPLIT to avoid doing it again. */ 4726 else if (!fword_ends 4727 && try_compound 4728 && (sp->ts_flags & TSF_DIDSPLIT) == 0) 4729 { 4730 try_compound = FALSE; 4731 sp->ts_flags |= TSF_DIDSPLIT; 4732 --sp->ts_curi; /* do the same NUL again */ 4733 compflags[sp->ts_complen] = NUL; 4734 } 4735 else 4736 sp->ts_flags &= ~TSF_DIDSPLIT; 4737 4738 if (try_split || try_compound) 4739 { 4740 if (!try_compound && (!fword_ends || !goodword_ends)) 4741 { 4742 /* If we're going to split need to check that the 4743 * words so far are valid for compounding. If there 4744 * is only one word it must not have the NEEDCOMPOUND 4745 * flag. 
*/ 4746 if (sp->ts_complen == sp->ts_compsplit 4747 && (flags & WF_NEEDCOMP)) 4748 break; 4749 p = preword; 4750 while (*skiptowhite(p) != NUL) 4751 p = skipwhite(skiptowhite(p)); 4752 if (sp->ts_complen > sp->ts_compsplit 4753 && !can_compound(slang, p, 4754 compflags + sp->ts_compsplit)) 4755 break; 4756 4757 if (slang->sl_nosplitsugs) 4758 newscore += SCORE_SPLIT_NO; 4759 else 4760 newscore += SCORE_SPLIT; 4761 4762 /* Give a bonus to words seen before. */ 4763 newscore = score_wordcount_adj(slang, newscore, 4764 preword + sp->ts_prewordlen, TRUE); 4765 } 4766 4767 if (TRY_DEEPER(su, stack, depth, newscore)) 4768 { 4769 go_deeper(stack, depth, newscore); 4770 #ifdef DEBUG_TRIEWALK 4771 if (!try_compound && !fword_ends) 4772 sprintf(changename[depth], "%.*s-%s: split", 4773 sp->ts_twordlen, tword, fword + sp->ts_fidx); 4774 else 4775 sprintf(changename[depth], "%.*s-%s: compound", 4776 sp->ts_twordlen, tword, fword + sp->ts_fidx); 4777 #endif 4778 /* Save things to be restored at STATE_SPLITUNDO. */ 4779 sp->ts_save_badflags = su->su_badflags; 4780 PROF_STORE(sp->ts_state) 4781 sp->ts_state = STATE_SPLITUNDO; 4782 4783 ++depth; 4784 sp = &stack[depth]; 4785 4786 /* Append a space to preword when splitting. */ 4787 if (!try_compound && !fword_ends) 4788 STRCAT(preword, " "); 4789 sp->ts_prewordlen = (char_u)STRLEN(preword); 4790 sp->ts_splitoff = sp->ts_twordlen; 4791 sp->ts_splitfidx = sp->ts_fidx; 4792 4793 /* If the badword has a non-word character at this 4794 * position skip it. That means replacing the 4795 * non-word character with a space. Always skip a 4796 * character when the word ends. But only when the 4797 * good word can end. */ 4798 if (((!try_compound && !spell_iswordp_nmw(fword 4799 + sp->ts_fidx, 4800 curwin)) 4801 || fword_ends) 4802 && fword[sp->ts_fidx] != NUL 4803 && goodword_ends) 4804 { 4805 int l; 4806 4807 l = MB_PTR2LEN(fword + sp->ts_fidx); 4808 if (fword_ends) 4809 { 4810 /* Copy the skipped character to preword. 
*/ 4811 mch_memmove(preword + sp->ts_prewordlen, 4812 fword + sp->ts_fidx, l); 4813 sp->ts_prewordlen += l; 4814 preword[sp->ts_prewordlen] = NUL; 4815 } 4816 else 4817 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST; 4818 sp->ts_fidx += l; 4819 } 4820 4821 /* When compounding include compound flag in 4822 * compflags[] (already set above). When splitting we 4823 * may start compounding over again. */ 4824 if (try_compound) 4825 ++sp->ts_complen; 4826 else 4827 sp->ts_compsplit = sp->ts_complen; 4828 sp->ts_prefixdepth = PFD_NOPREFIX; 4829 4830 /* set su->su_badflags to the caps type at this 4831 * position */ 4832 if (has_mbyte) 4833 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 4834 else 4835 n = sp->ts_fidx; 4836 su->su_badflags = badword_captype(su->su_badptr + n, 4837 su->su_badptr + su->su_badlen); 4838 4839 /* Restart at top of the tree. */ 4840 sp->ts_arridx = 0; 4841 4842 /* If there are postponed prefixes, try these too. */ 4843 if (pbyts != NULL) 4844 { 4845 byts = pbyts; 4846 idxs = pidxs; 4847 sp->ts_prefixdepth = PFD_PREFIXTREE; 4848 PROF_STORE(sp->ts_state) 4849 sp->ts_state = STATE_NOPREFIX; 4850 } 4851 } 4852 } 4853 } 4854 break; 4855 4856 case STATE_SPLITUNDO: 4857 /* Undo the changes done for word split or compound word. */ 4858 su->su_badflags = sp->ts_save_badflags; 4859 4860 /* Continue looking for NUL bytes. */ 4861 PROF_STORE(sp->ts_state) 4862 sp->ts_state = STATE_START; 4863 4864 /* In case we went into the prefix tree. */ 4865 byts = fbyts; 4866 idxs = fidxs; 4867 break; 4868 4869 case STATE_ENDNUL: 4870 /* Past the NUL bytes in the node. */ 4871 su->su_badflags = sp->ts_save_badflags; 4872 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0) 4873 { 4874 /* The badword ends, can't use STATE_PLAIN. 
*/ 4875 PROF_STORE(sp->ts_state) 4876 sp->ts_state = STATE_DEL; 4877 break; 4878 } 4879 PROF_STORE(sp->ts_state) 4880 sp->ts_state = STATE_PLAIN; 4881 /* FALLTHROUGH */ 4882 4883 case STATE_PLAIN: 4884 /* 4885 * Go over all possible bytes at this node, add each to tword[] 4886 * and use child node. "ts_curi" is the index. 4887 */ 4888 arridx = sp->ts_arridx; 4889 if (sp->ts_curi > byts[arridx]) 4890 { 4891 /* Done all bytes at this node, do next state. When still at 4892 * already changed bytes skip the other tricks. */ 4893 PROF_STORE(sp->ts_state) 4894 if (sp->ts_fidx >= sp->ts_fidxtry) 4895 sp->ts_state = STATE_DEL; 4896 else 4897 sp->ts_state = STATE_FINAL; 4898 } 4899 else 4900 { 4901 arridx += sp->ts_curi++; 4902 c = byts[arridx]; 4903 4904 /* Normal byte, go one level deeper. If it's not equal to the 4905 * byte in the bad word adjust the score. But don't even try 4906 * when the byte was already changed. And don't try when we 4907 * just deleted this byte, accepting it is always cheaper than 4908 * delete + substitute. 
*/ 4909 if (c == fword[sp->ts_fidx] 4910 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE)) 4911 newscore = 0; 4912 else 4913 newscore = SCORE_SUBST; 4914 if ((newscore == 0 4915 || (sp->ts_fidx >= sp->ts_fidxtry 4916 && ((sp->ts_flags & TSF_DIDDEL) == 0 4917 || c != fword[sp->ts_delidx]))) 4918 && TRY_DEEPER(su, stack, depth, newscore)) 4919 { 4920 go_deeper(stack, depth, newscore); 4921 #ifdef DEBUG_TRIEWALK 4922 if (newscore > 0) 4923 sprintf(changename[depth], "%.*s-%s: subst %c to %c", 4924 sp->ts_twordlen, tword, fword + sp->ts_fidx, 4925 fword[sp->ts_fidx], c); 4926 else 4927 sprintf(changename[depth], "%.*s-%s: accept %c", 4928 sp->ts_twordlen, tword, fword + sp->ts_fidx, 4929 fword[sp->ts_fidx]); 4930 #endif 4931 ++depth; 4932 sp = &stack[depth]; 4933 ++sp->ts_fidx; 4934 tword[sp->ts_twordlen++] = c; 4935 sp->ts_arridx = idxs[arridx]; 4936 if (newscore == SCORE_SUBST) 4937 sp->ts_isdiff = DIFF_YES; 4938 if (has_mbyte) 4939 { 4940 /* Multi-byte characters are a bit complicated to 4941 * handle: They differ when any of the bytes differ 4942 * and then their length may also differ. */ 4943 if (sp->ts_tcharlen == 0) 4944 { 4945 /* First byte. */ 4946 sp->ts_tcharidx = 0; 4947 sp->ts_tcharlen = MB_BYTE2LEN(c); 4948 sp->ts_fcharstart = sp->ts_fidx - 1; 4949 sp->ts_isdiff = (newscore != 0) 4950 ? DIFF_YES : DIFF_NONE; 4951 } 4952 else if (sp->ts_isdiff == DIFF_INSERT) 4953 /* When inserting trail bytes don't advance in the 4954 * bad word. */ 4955 --sp->ts_fidx; 4956 if (++sp->ts_tcharidx == sp->ts_tcharlen) 4957 { 4958 /* Last byte of character. */ 4959 if (sp->ts_isdiff == DIFF_YES) 4960 { 4961 /* Correct ts_fidx for the byte length of the 4962 * character (we didn't check that before). */ 4963 sp->ts_fidx = sp->ts_fcharstart 4964 + MB_PTR2LEN( 4965 fword + sp->ts_fcharstart); 4966 /* For changing a composing character adjust 4967 * the score from SCORE_SUBST to 4968 * SCORE_SUBCOMP. 
*/ 4969 if (enc_utf8 4970 && utf_iscomposing( 4971 utf_ptr2char(tword 4972 + sp->ts_twordlen 4973 - sp->ts_tcharlen)) 4974 && utf_iscomposing( 4975 utf_ptr2char(fword 4976 + sp->ts_fcharstart))) 4977 sp->ts_score -= 4978 SCORE_SUBST - SCORE_SUBCOMP; 4979 4980 /* For a similar character adjust score from 4981 * SCORE_SUBST to SCORE_SIMILAR. */ 4982 else if (!soundfold 4983 && slang->sl_has_map 4984 && similar_chars(slang, 4985 mb_ptr2char(tword 4986 + sp->ts_twordlen 4987 - sp->ts_tcharlen), 4988 mb_ptr2char(fword 4989 + sp->ts_fcharstart))) 4990 sp->ts_score -= 4991 SCORE_SUBST - SCORE_SIMILAR; 4992 } 4993 else if (sp->ts_isdiff == DIFF_INSERT 4994 && sp->ts_twordlen > sp->ts_tcharlen) 4995 { 4996 p = tword + sp->ts_twordlen - sp->ts_tcharlen; 4997 c = mb_ptr2char(p); 4998 if (enc_utf8 && utf_iscomposing(c)) 4999 { 5000 /* Inserting a composing char doesn't 5001 * count that much. */ 5002 sp->ts_score -= SCORE_INS - SCORE_INSCOMP; 5003 } 5004 else 5005 { 5006 /* If the previous character was the same, 5007 * thus doubling a character, give a bonus 5008 * to the score. Also for the soundfold 5009 * tree (might seem illogical but does 5010 * give better scores). */ 5011 MB_PTR_BACK(tword, p); 5012 if (c == mb_ptr2char(p)) 5013 sp->ts_score -= SCORE_INS 5014 - SCORE_INSDUP; 5015 } 5016 } 5017 5018 /* Starting a new char, reset the length. */ 5019 sp->ts_tcharlen = 0; 5020 } 5021 } 5022 else 5023 { 5024 /* If we found a similar char adjust the score. 5025 * We do this after calling go_deeper() because 5026 * it's slow. */ 5027 if (newscore != 0 5028 && !soundfold 5029 && slang->sl_has_map 5030 && similar_chars(slang, 5031 c, fword[sp->ts_fidx - 1])) 5032 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; 5033 } 5034 } 5035 } 5036 break; 5037 5038 case STATE_DEL: 5039 /* When past the first byte of a multi-byte char don't try 5040 * delete/insert/swap a character. 
*/ 5041 if (has_mbyte && sp->ts_tcharlen > 0) 5042 { 5043 PROF_STORE(sp->ts_state) 5044 sp->ts_state = STATE_FINAL; 5045 break; 5046 } 5047 /* 5048 * Try skipping one character in the bad word (delete it). 5049 */ 5050 PROF_STORE(sp->ts_state) 5051 sp->ts_state = STATE_INS_PREP; 5052 sp->ts_curi = 1; 5053 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') 5054 /* Deleting a vowel at the start of a word counts less, see 5055 * soundalike_score(). */ 5056 newscore = 2 * SCORE_DEL / 3; 5057 else 5058 newscore = SCORE_DEL; 5059 if (fword[sp->ts_fidx] != NUL 5060 && TRY_DEEPER(su, stack, depth, newscore)) 5061 { 5062 go_deeper(stack, depth, newscore); 5063 #ifdef DEBUG_TRIEWALK 5064 sprintf(changename[depth], "%.*s-%s: delete %c", 5065 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5066 fword[sp->ts_fidx]); 5067 #endif 5068 ++depth; 5069 5070 /* Remember what character we deleted, so that we can avoid 5071 * inserting it again. */ 5072 stack[depth].ts_flags |= TSF_DIDDEL; 5073 stack[depth].ts_delidx = sp->ts_fidx; 5074 5075 /* Advance over the character in fword[]. Give a bonus to the 5076 * score if the same character is following "nn" -> "n". It's 5077 * a bit illogical for soundfold tree but it does give better 5078 * results. */ 5079 if (has_mbyte) 5080 { 5081 c = mb_ptr2char(fword + sp->ts_fidx); 5082 stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx); 5083 if (enc_utf8 && utf_iscomposing(c)) 5084 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; 5085 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) 5086 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 5087 } 5088 else 5089 { 5090 ++stack[depth].ts_fidx; 5091 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) 5092 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 5093 } 5094 break; 5095 } 5096 /* FALLTHROUGH */ 5097 5098 case STATE_INS_PREP: 5099 if (sp->ts_flags & TSF_DIDDEL) 5100 { 5101 /* If we just deleted a byte then inserting won't make sense, 5102 * a substitute is always cheaper. 
*/ 5103 PROF_STORE(sp->ts_state) 5104 sp->ts_state = STATE_SWAP; 5105 break; 5106 } 5107 5108 /* skip over NUL bytes */ 5109 n = sp->ts_arridx; 5110 for (;;) 5111 { 5112 if (sp->ts_curi > byts[n]) 5113 { 5114 /* Only NUL bytes at this node, go to next state. */ 5115 PROF_STORE(sp->ts_state) 5116 sp->ts_state = STATE_SWAP; 5117 break; 5118 } 5119 if (byts[n + sp->ts_curi] != NUL) 5120 { 5121 /* Found a byte to insert. */ 5122 PROF_STORE(sp->ts_state) 5123 sp->ts_state = STATE_INS; 5124 break; 5125 } 5126 ++sp->ts_curi; 5127 } 5128 break; 5129 5130 /* FALLTHROUGH */ 5131 5132 case STATE_INS: 5133 /* Insert one byte. Repeat this for each possible byte at this 5134 * node. */ 5135 n = sp->ts_arridx; 5136 if (sp->ts_curi > byts[n]) 5137 { 5138 /* Done all bytes at this node, go to next state. */ 5139 PROF_STORE(sp->ts_state) 5140 sp->ts_state = STATE_SWAP; 5141 break; 5142 } 5143 5144 /* Do one more byte at this node, but: 5145 * - Skip NUL bytes. 5146 * - Skip the byte if it's equal to the byte in the word, 5147 * accepting that byte is always better. 5148 */ 5149 n += sp->ts_curi++; 5150 c = byts[n]; 5151 if (soundfold && sp->ts_twordlen == 0 && c == '*') 5152 /* Inserting a vowel at the start of a word counts less, 5153 * see soundalike_score(). */ 5154 newscore = 2 * SCORE_INS / 3; 5155 else 5156 newscore = SCORE_INS; 5157 if (c != fword[sp->ts_fidx] 5158 && TRY_DEEPER(su, stack, depth, newscore)) 5159 { 5160 go_deeper(stack, depth, newscore); 5161 #ifdef DEBUG_TRIEWALK 5162 sprintf(changename[depth], "%.*s-%s: insert %c", 5163 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5164 c); 5165 #endif 5166 ++depth; 5167 sp = &stack[depth]; 5168 tword[sp->ts_twordlen++] = c; 5169 sp->ts_arridx = idxs[n]; 5170 if (has_mbyte) 5171 { 5172 fl = MB_BYTE2LEN(c); 5173 if (fl > 1) 5174 { 5175 /* There are following bytes for the same character. 5176 * We must find all bytes before trying 5177 * delete/insert/swap/etc. 
*/ 5178 sp->ts_tcharlen = fl; 5179 sp->ts_tcharidx = 1; 5180 sp->ts_isdiff = DIFF_INSERT; 5181 } 5182 } 5183 else 5184 fl = 1; 5185 if (fl == 1) 5186 { 5187 /* If the previous character was the same, thus doubling a 5188 * character, give a bonus to the score. Also for 5189 * soundfold words (illogical but does give a better 5190 * score). */ 5191 if (sp->ts_twordlen >= 2 5192 && tword[sp->ts_twordlen - 2] == c) 5193 sp->ts_score -= SCORE_INS - SCORE_INSDUP; 5194 } 5195 } 5196 break; 5197 5198 case STATE_SWAP: 5199 /* 5200 * Swap two bytes in the bad word: "12" -> "21". 5201 * We change "fword" here, it's changed back afterwards at 5202 * STATE_UNSWAP. 5203 */ 5204 p = fword + sp->ts_fidx; 5205 c = *p; 5206 if (c == NUL) 5207 { 5208 /* End of word, can't swap or replace. */ 5209 PROF_STORE(sp->ts_state) 5210 sp->ts_state = STATE_FINAL; 5211 break; 5212 } 5213 5214 /* Don't swap if the first character is not a word character. 5215 * SWAP3 etc. also don't make sense then. */ 5216 if (!soundfold && !spell_iswordp(p, curwin)) 5217 { 5218 PROF_STORE(sp->ts_state) 5219 sp->ts_state = STATE_REP_INI; 5220 break; 5221 } 5222 5223 if (has_mbyte) 5224 { 5225 n = MB_CPTR2LEN(p); 5226 c = mb_ptr2char(p); 5227 if (p[n] == NUL) 5228 c2 = NUL; 5229 else if (!soundfold && !spell_iswordp(p + n, curwin)) 5230 c2 = c; /* don't swap non-word char */ 5231 else 5232 c2 = mb_ptr2char(p + n); 5233 } 5234 else 5235 { 5236 if (p[1] == NUL) 5237 c2 = NUL; 5238 else if (!soundfold && !spell_iswordp(p + 1, curwin)) 5239 c2 = c; /* don't swap non-word char */ 5240 else 5241 c2 = p[1]; 5242 } 5243 5244 /* When the second character is NUL we can't swap. */ 5245 if (c2 == NUL) 5246 { 5247 PROF_STORE(sp->ts_state) 5248 sp->ts_state = STATE_REP_INI; 5249 break; 5250 } 5251 5252 /* When characters are identical, swap won't do anything. 5253 * Also get here if the second char is not a word character. 
*/ 5254 if (c == c2) 5255 { 5256 PROF_STORE(sp->ts_state) 5257 sp->ts_state = STATE_SWAP3; 5258 break; 5259 } 5260 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP)) 5261 { 5262 go_deeper(stack, depth, SCORE_SWAP); 5263 #ifdef DEBUG_TRIEWALK 5264 sprintf(changename[depth], "%.*s-%s: swap %c and %c", 5265 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5266 c, c2); 5267 #endif 5268 PROF_STORE(sp->ts_state) 5269 sp->ts_state = STATE_UNSWAP; 5270 ++depth; 5271 if (has_mbyte) 5272 { 5273 fl = mb_char2len(c2); 5274 mch_memmove(p, p + n, fl); 5275 mb_char2bytes(c, p + fl); 5276 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 5277 } 5278 else 5279 { 5280 p[0] = c2; 5281 p[1] = c; 5282 stack[depth].ts_fidxtry = sp->ts_fidx + 2; 5283 } 5284 } 5285 else 5286 { 5287 /* If this swap doesn't work then SWAP3 won't either. */ 5288 PROF_STORE(sp->ts_state) 5289 sp->ts_state = STATE_REP_INI; 5290 } 5291 break; 5292 5293 case STATE_UNSWAP: 5294 /* Undo the STATE_SWAP swap: "21" -> "12". */ 5295 p = fword + sp->ts_fidx; 5296 if (has_mbyte) 5297 { 5298 n = MB_PTR2LEN(p); 5299 c = mb_ptr2char(p + n); 5300 mch_memmove(p + MB_PTR2LEN(p + n), p, n); 5301 mb_char2bytes(c, p); 5302 } 5303 else 5304 { 5305 c = *p; 5306 *p = p[1]; 5307 p[1] = c; 5308 } 5309 /* FALLTHROUGH */ 5310 5311 case STATE_SWAP3: 5312 /* Swap two bytes, skipping one: "123" -> "321". We change 5313 * "fword" here, it's changed back afterwards at STATE_UNSWAP3. 
*/ 5314 p = fword + sp->ts_fidx; 5315 if (has_mbyte) 5316 { 5317 n = MB_CPTR2LEN(p); 5318 c = mb_ptr2char(p); 5319 fl = MB_CPTR2LEN(p + n); 5320 c2 = mb_ptr2char(p + n); 5321 if (!soundfold && !spell_iswordp(p + n + fl, curwin)) 5322 c3 = c; /* don't swap non-word char */ 5323 else 5324 c3 = mb_ptr2char(p + n + fl); 5325 } 5326 else 5327 { 5328 c = *p; 5329 c2 = p[1]; 5330 if (!soundfold && !spell_iswordp(p + 2, curwin)) 5331 c3 = c; /* don't swap non-word char */ 5332 else 5333 c3 = p[2]; 5334 } 5335 5336 /* When characters are identical: "121" then SWAP3 result is 5337 * identical, ROT3L result is same as SWAP: "211", ROT3L result is 5338 * same as SWAP on next char: "112". Thus skip all swapping. 5339 * Also skip when c3 is NUL. 5340 * Also get here when the third character is not a word character. 5341 * Second character may any char: "a.b" -> "b.a" */ 5342 if (c == c3 || c3 == NUL) 5343 { 5344 PROF_STORE(sp->ts_state) 5345 sp->ts_state = STATE_REP_INI; 5346 break; 5347 } 5348 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5349 { 5350 go_deeper(stack, depth, SCORE_SWAP3); 5351 #ifdef DEBUG_TRIEWALK 5352 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c", 5353 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5354 c, c3); 5355 #endif 5356 PROF_STORE(sp->ts_state) 5357 sp->ts_state = STATE_UNSWAP3; 5358 ++depth; 5359 if (has_mbyte) 5360 { 5361 tl = mb_char2len(c3); 5362 mch_memmove(p, p + n + fl, tl); 5363 mb_char2bytes(c2, p + tl); 5364 mb_char2bytes(c, p + fl + tl); 5365 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; 5366 } 5367 else 5368 { 5369 p[0] = p[2]; 5370 p[2] = c; 5371 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5372 } 5373 } 5374 else 5375 { 5376 PROF_STORE(sp->ts_state) 5377 sp->ts_state = STATE_REP_INI; 5378 } 5379 break; 5380 5381 case STATE_UNSWAP3: 5382 /* Undo STATE_SWAP3: "321" -> "123" */ 5383 p = fword + sp->ts_fidx; 5384 if (has_mbyte) 5385 { 5386 n = MB_PTR2LEN(p); 5387 c2 = mb_ptr2char(p + n); 5388 fl = MB_PTR2LEN(p + n); 5389 c = 
mb_ptr2char(p + n + fl); 5390 tl = MB_PTR2LEN(p + n + fl); 5391 mch_memmove(p + fl + tl, p, n); 5392 mb_char2bytes(c, p); 5393 mb_char2bytes(c2, p + tl); 5394 p = p + tl; 5395 } 5396 else 5397 { 5398 c = *p; 5399 *p = p[2]; 5400 p[2] = c; 5401 ++p; 5402 } 5403 5404 if (!soundfold && !spell_iswordp(p, curwin)) 5405 { 5406 /* Middle char is not a word char, skip the rotate. First and 5407 * third char were already checked at swap and swap3. */ 5408 PROF_STORE(sp->ts_state) 5409 sp->ts_state = STATE_REP_INI; 5410 break; 5411 } 5412 5413 /* Rotate three characters left: "123" -> "231". We change 5414 * "fword" here, it's changed back afterwards at STATE_UNROT3L. */ 5415 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5416 { 5417 go_deeper(stack, depth, SCORE_SWAP3); 5418 #ifdef DEBUG_TRIEWALK 5419 p = fword + sp->ts_fidx; 5420 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c", 5421 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5422 p[0], p[1], p[2]); 5423 #endif 5424 PROF_STORE(sp->ts_state) 5425 sp->ts_state = STATE_UNROT3L; 5426 ++depth; 5427 p = fword + sp->ts_fidx; 5428 if (has_mbyte) 5429 { 5430 n = MB_CPTR2LEN(p); 5431 c = mb_ptr2char(p); 5432 fl = MB_CPTR2LEN(p + n); 5433 fl += MB_CPTR2LEN(p + n + fl); 5434 mch_memmove(p, p + n, fl); 5435 mb_char2bytes(c, p + fl); 5436 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 5437 } 5438 else 5439 { 5440 c = *p; 5441 *p = p[1]; 5442 p[1] = p[2]; 5443 p[2] = c; 5444 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5445 } 5446 } 5447 else 5448 { 5449 PROF_STORE(sp->ts_state) 5450 sp->ts_state = STATE_REP_INI; 5451 } 5452 break; 5453 5454 case STATE_UNROT3L: 5455 /* Undo ROT3L: "231" -> "123" */ 5456 p = fword + sp->ts_fidx; 5457 if (has_mbyte) 5458 { 5459 n = MB_PTR2LEN(p); 5460 n += MB_PTR2LEN(p + n); 5461 c = mb_ptr2char(p + n); 5462 tl = MB_PTR2LEN(p + n); 5463 mch_memmove(p + tl, p, n); 5464 mb_char2bytes(c, p); 5465 } 5466 else 5467 { 5468 c = p[2]; 5469 p[2] = p[1]; 5470 p[1] = *p; 5471 *p = c; 5472 } 5473 5474 /* 
Rotate three bytes right: "123" -> "312". We change "fword" 5475 * here, it's changed back afterwards at STATE_UNROT3R. */ 5476 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5477 { 5478 go_deeper(stack, depth, SCORE_SWAP3); 5479 #ifdef DEBUG_TRIEWALK 5480 p = fword + sp->ts_fidx; 5481 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c", 5482 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5483 p[0], p[1], p[2]); 5484 #endif 5485 PROF_STORE(sp->ts_state) 5486 sp->ts_state = STATE_UNROT3R; 5487 ++depth; 5488 p = fword + sp->ts_fidx; 5489 if (has_mbyte) 5490 { 5491 n = MB_CPTR2LEN(p); 5492 n += MB_CPTR2LEN(p + n); 5493 c = mb_ptr2char(p + n); 5494 tl = MB_CPTR2LEN(p + n); 5495 mch_memmove(p + tl, p, n); 5496 mb_char2bytes(c, p); 5497 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; 5498 } 5499 else 5500 { 5501 c = p[2]; 5502 p[2] = p[1]; 5503 p[1] = *p; 5504 *p = c; 5505 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5506 } 5507 } 5508 else 5509 { 5510 PROF_STORE(sp->ts_state) 5511 sp->ts_state = STATE_REP_INI; 5512 } 5513 break; 5514 5515 case STATE_UNROT3R: 5516 /* Undo ROT3R: "312" -> "123" */ 5517 p = fword + sp->ts_fidx; 5518 if (has_mbyte) 5519 { 5520 c = mb_ptr2char(p); 5521 tl = MB_PTR2LEN(p); 5522 n = MB_PTR2LEN(p + tl); 5523 n += MB_PTR2LEN(p + tl + n); 5524 mch_memmove(p, p + tl, n); 5525 mb_char2bytes(c, p + n); 5526 } 5527 else 5528 { 5529 c = *p; 5530 *p = p[1]; 5531 p[1] = p[2]; 5532 p[2] = c; 5533 } 5534 /* FALLTHROUGH */ 5535 5536 case STATE_REP_INI: 5537 /* Check if matching with REP items from the .aff file would work. 
5538 * Quickly skip if: 5539 * - there are no REP items and we are not in the soundfold trie 5540 * - the score is going to be too high anyway 5541 * - already applied a REP item or swapped here */ 5542 if ((lp->lp_replang == NULL && !soundfold) 5543 || sp->ts_score + SCORE_REP >= su->su_maxscore 5544 || sp->ts_fidx < sp->ts_fidxtry) 5545 { 5546 PROF_STORE(sp->ts_state) 5547 sp->ts_state = STATE_FINAL; 5548 break; 5549 } 5550 5551 /* Use the first byte to quickly find the first entry that may 5552 * match. If the index is -1 there is none. */ 5553 if (soundfold) 5554 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]]; 5555 else 5556 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; 5557 5558 if (sp->ts_curi < 0) 5559 { 5560 PROF_STORE(sp->ts_state) 5561 sp->ts_state = STATE_FINAL; 5562 break; 5563 } 5564 5565 PROF_STORE(sp->ts_state) 5566 sp->ts_state = STATE_REP; 5567 /* FALLTHROUGH */ 5568 5569 case STATE_REP: 5570 /* Try matching with REP items from the .aff file. For each match 5571 * replace the characters and check if the resulting word is 5572 * valid. */ 5573 p = fword + sp->ts_fidx; 5574 5575 if (soundfold) 5576 gap = &slang->sl_repsal; 5577 else 5578 gap = &lp->lp_replang->sl_rep; 5579 while (sp->ts_curi < gap->ga_len) 5580 { 5581 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; 5582 if (*ftp->ft_from != *p) 5583 { 5584 /* past possible matching entries */ 5585 sp->ts_curi = gap->ga_len; 5586 break; 5587 } 5588 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 5589 && TRY_DEEPER(su, stack, depth, SCORE_REP)) 5590 { 5591 go_deeper(stack, depth, SCORE_REP); 5592 #ifdef DEBUG_TRIEWALK 5593 sprintf(changename[depth], "%.*s-%s: replace %s with %s", 5594 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5595 ftp->ft_from, ftp->ft_to); 5596 #endif 5597 /* Need to undo this afterwards. */ 5598 PROF_STORE(sp->ts_state) 5599 sp->ts_state = STATE_REP_UNDO; 5600 5601 /* Change the "from" to the "to" string. 
*/ 5602 ++depth; 5603 fl = (int)STRLEN(ftp->ft_from); 5604 tl = (int)STRLEN(ftp->ft_to); 5605 if (fl != tl) 5606 { 5607 STRMOVE(p + tl, p + fl); 5608 repextra += tl - fl; 5609 } 5610 mch_memmove(p, ftp->ft_to, tl); 5611 stack[depth].ts_fidxtry = sp->ts_fidx + tl; 5612 stack[depth].ts_tcharlen = 0; 5613 break; 5614 } 5615 } 5616 5617 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) 5618 { 5619 /* No (more) matches. */ 5620 PROF_STORE(sp->ts_state) 5621 sp->ts_state = STATE_FINAL; 5622 } 5623 5624 break; 5625 5626 case STATE_REP_UNDO: 5627 /* Undo a REP replacement and continue with the next one. */ 5628 if (soundfold) 5629 gap = &slang->sl_repsal; 5630 else 5631 gap = &lp->lp_replang->sl_rep; 5632 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; 5633 fl = (int)STRLEN(ftp->ft_from); 5634 tl = (int)STRLEN(ftp->ft_to); 5635 p = fword + sp->ts_fidx; 5636 if (fl != tl) 5637 { 5638 STRMOVE(p + fl, p + tl); 5639 repextra -= tl - fl; 5640 } 5641 mch_memmove(p, ftp->ft_from, fl); 5642 PROF_STORE(sp->ts_state) 5643 sp->ts_state = STATE_REP; 5644 break; 5645 5646 default: 5647 /* Did all possible states at this level, go up one level. */ 5648 --depth; 5649 5650 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) 5651 { 5652 /* Continue in or go back to the prefix tree. */ 5653 byts = pbyts; 5654 idxs = pidxs; 5655 } 5656 5657 /* Don't check for CTRL-C too often, it takes time. */ 5658 if (--breakcheckcount == 0) 5659 { 5660 ui_breakcheck(); 5661 breakcheckcount = 1000; 5662 } 5663 } 5664 } 5665 } 5666 5667 5668 /* 5669 * Go one level deeper in the tree. 
5670 */ 5671 static void 5672 go_deeper(trystate_T *stack, int depth, int score_add) 5673 { 5674 stack[depth + 1] = stack[depth]; 5675 stack[depth + 1].ts_state = STATE_START; 5676 stack[depth + 1].ts_score = stack[depth].ts_score + score_add; 5677 stack[depth + 1].ts_curi = 1; /* start just after length byte */ 5678 stack[depth + 1].ts_flags = 0; 5679 } 5680 5681 /* 5682 * Case-folding may change the number of bytes: Count nr of chars in 5683 * fword[flen] and return the byte length of that many chars in "word". 5684 */ 5685 static int 5686 nofold_len(char_u *fword, int flen, char_u *word) 5687 { 5688 char_u *p; 5689 int i = 0; 5690 5691 for (p = fword; p < fword + flen; MB_PTR_ADV(p)) 5692 ++i; 5693 for (p = word; i > 0; MB_PTR_ADV(p)) 5694 --i; 5695 return (int)(p - word); 5696 } 5697 5698 /* 5699 * "fword" is a good word with case folded. Find the matching keep-case 5700 * words and put it in "kword". 5701 * Theoretically there could be several keep-case words that result in the 5702 * same case-folded word, but we only find one... 5703 */ 5704 static void 5705 find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword) 5706 { 5707 char_u uword[MAXWLEN]; /* "fword" in upper-case */ 5708 int depth; 5709 idx_T tryidx; 5710 5711 /* The following arrays are used at each depth in the tree. */ 5712 idx_T arridx[MAXWLEN]; 5713 int round[MAXWLEN]; 5714 int fwordidx[MAXWLEN]; 5715 int uwordidx[MAXWLEN]; 5716 int kwordlen[MAXWLEN]; 5717 5718 int flen, ulen; 5719 int l; 5720 int len; 5721 int c; 5722 idx_T lo, hi, m; 5723 char_u *p; 5724 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */ 5725 idx_T *idxs = slang->sl_kidxs; /* array with indexes */ 5726 5727 if (byts == NULL) 5728 { 5729 /* array is empty: "cannot happen" */ 5730 *kword = NUL; 5731 return; 5732 } 5733 5734 /* Make an all-cap version of "fword". */ 5735 allcap_copy(fword, uword); 5736 5737 /* 5738 * Each character needs to be tried both case-folded and upper-case. 
5739 * All this gets very complicated if we keep in mind that changing case 5740 * may change the byte length of a multi-byte character... 5741 */ 5742 depth = 0; 5743 arridx[0] = 0; 5744 round[0] = 0; 5745 fwordidx[0] = 0; 5746 uwordidx[0] = 0; 5747 kwordlen[0] = 0; 5748 while (depth >= 0) 5749 { 5750 if (fword[fwordidx[depth]] == NUL) 5751 { 5752 /* We are at the end of "fword". If the tree allows a word to end 5753 * here we have found a match. */ 5754 if (byts[arridx[depth] + 1] == 0) 5755 { 5756 kword[kwordlen[depth]] = NUL; 5757 return; 5758 } 5759 5760 /* kword is getting too long, continue one level up */ 5761 --depth; 5762 } 5763 else if (++round[depth] > 2) 5764 { 5765 /* tried both fold-case and upper-case character, continue one 5766 * level up */ 5767 --depth; 5768 } 5769 else 5770 { 5771 /* 5772 * round[depth] == 1: Try using the folded-case character. 5773 * round[depth] == 2: Try using the upper-case character. 5774 */ 5775 if (has_mbyte) 5776 { 5777 flen = MB_CPTR2LEN(fword + fwordidx[depth]); 5778 ulen = MB_CPTR2LEN(uword + uwordidx[depth]); 5779 } 5780 else 5781 ulen = flen = 1; 5782 if (round[depth] == 1) 5783 { 5784 p = fword + fwordidx[depth]; 5785 l = flen; 5786 } 5787 else 5788 { 5789 p = uword + uwordidx[depth]; 5790 l = ulen; 5791 } 5792 5793 for (tryidx = arridx[depth]; l > 0; --l) 5794 { 5795 /* Perform a binary search in the list of accepted bytes. */ 5796 len = byts[tryidx++]; 5797 c = *p++; 5798 lo = tryidx; 5799 hi = tryidx + len - 1; 5800 while (lo < hi) 5801 { 5802 m = (lo + hi) / 2; 5803 if (byts[m] > c) 5804 hi = m - 1; 5805 else if (byts[m] < c) 5806 lo = m + 1; 5807 else 5808 { 5809 lo = hi = m; 5810 break; 5811 } 5812 } 5813 5814 /* Stop if there is no matching byte. */ 5815 if (hi < lo || byts[lo] != c) 5816 break; 5817 5818 /* Continue at the child (if there is one). */ 5819 tryidx = idxs[lo]; 5820 } 5821 5822 if (l == 0) 5823 { 5824 /* 5825 * Found the matching char. Copy it to "kword" and go a 5826 * level deeper. 
5827 */ 5828 if (round[depth] == 1) 5829 { 5830 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth], 5831 flen); 5832 kwordlen[depth + 1] = kwordlen[depth] + flen; 5833 } 5834 else 5835 { 5836 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth], 5837 ulen); 5838 kwordlen[depth + 1] = kwordlen[depth] + ulen; 5839 } 5840 fwordidx[depth + 1] = fwordidx[depth] + flen; 5841 uwordidx[depth + 1] = uwordidx[depth] + ulen; 5842 5843 ++depth; 5844 arridx[depth] = tryidx; 5845 round[depth] = 0; 5846 } 5847 } 5848 } 5849 5850 /* Didn't find it: "cannot happen". */ 5851 *kword = NUL; 5852 } 5853 5854 /* 5855 * Compute the sound-a-like score for suggestions in su->su_ga and add them to 5856 * su->su_sga. 5857 */ 5858 static void 5859 score_comp_sal(suginfo_T *su) 5860 { 5861 langp_T *lp; 5862 char_u badsound[MAXWLEN]; 5863 int i; 5864 suggest_T *stp; 5865 suggest_T *sstp; 5866 int score; 5867 int lpi; 5868 5869 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL) 5870 return; 5871 5872 /* Use the sound-folding of the first language that supports it. */ 5873 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 5874 { 5875 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 5876 if (lp->lp_slang->sl_sal.ga_len > 0) 5877 { 5878 /* soundfold the bad word */ 5879 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound); 5880 5881 for (i = 0; i < su->su_ga.ga_len; ++i) 5882 { 5883 stp = &SUG(su->su_ga, i); 5884 5885 /* Case-fold the suggested word, sound-fold it and compute the 5886 * sound-a-like score. */ 5887 score = stp_sal_score(stp, su, lp->lp_slang, badsound); 5888 if (score < SCORE_MAXMAX) 5889 { 5890 /* Add the suggestion. 
*/ 5891 sstp = &SUG(su->su_sga, su->su_sga.ga_len); 5892 sstp->st_word = vim_strsave(stp->st_word); 5893 if (sstp->st_word != NULL) 5894 { 5895 sstp->st_wordlen = stp->st_wordlen; 5896 sstp->st_score = score; 5897 sstp->st_altscore = 0; 5898 sstp->st_orglen = stp->st_orglen; 5899 ++su->su_sga.ga_len; 5900 } 5901 } 5902 } 5903 break; 5904 } 5905 } 5906 } 5907 5908 /* 5909 * Combine the list of suggestions in su->su_ga and su->su_sga. 5910 * They are entwined. 5911 */ 5912 static void 5913 score_combine(suginfo_T *su) 5914 { 5915 int i; 5916 int j; 5917 garray_T ga; 5918 garray_T *gap; 5919 langp_T *lp; 5920 suggest_T *stp; 5921 char_u *p; 5922 char_u badsound[MAXWLEN]; 5923 int round; 5924 int lpi; 5925 slang_T *slang = NULL; 5926 5927 /* Add the alternate score to su_ga. */ 5928 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 5929 { 5930 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 5931 if (lp->lp_slang->sl_sal.ga_len > 0) 5932 { 5933 /* soundfold the bad word */ 5934 slang = lp->lp_slang; 5935 spell_soundfold(slang, su->su_fbadword, TRUE, badsound); 5936 5937 for (i = 0; i < su->su_ga.ga_len; ++i) 5938 { 5939 stp = &SUG(su->su_ga, i); 5940 stp->st_altscore = stp_sal_score(stp, su, slang, badsound); 5941 if (stp->st_altscore == SCORE_MAXMAX) 5942 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; 5943 else 5944 stp->st_score = (stp->st_score * 3 5945 + stp->st_altscore) / 4; 5946 stp->st_salscore = FALSE; 5947 } 5948 break; 5949 } 5950 } 5951 5952 if (slang == NULL) /* Using "double" without sound folding. */ 5953 { 5954 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, 5955 su->su_maxcount); 5956 return; 5957 } 5958 5959 /* Add the alternate score to su_sga. 
*/ 5960 for (i = 0; i < su->su_sga.ga_len; ++i) 5961 { 5962 stp = &SUG(su->su_sga, i); 5963 stp->st_altscore = spell_edit_score(slang, 5964 su->su_badword, stp->st_word); 5965 if (stp->st_score == SCORE_MAXMAX) 5966 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; 5967 else 5968 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; 5969 stp->st_salscore = TRUE; 5970 } 5971 5972 /* Remove bad suggestions, sort the suggestions and truncate at "maxcount" 5973 * for both lists. */ 5974 check_suggestions(su, &su->su_ga); 5975 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 5976 check_suggestions(su, &su->su_sga); 5977 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); 5978 5979 ga_init2(&ga, (int)sizeof(suginfo_T), 1); 5980 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) 5981 return; 5982 5983 stp = &SUG(ga, 0); 5984 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) 5985 { 5986 /* round 1: get a suggestion from su_ga 5987 * round 2: get a suggestion from su_sga */ 5988 for (round = 1; round <= 2; ++round) 5989 { 5990 gap = round == 1 ? &su->su_ga : &su->su_sga; 5991 if (i < gap->ga_len) 5992 { 5993 /* Don't add a word if it's already there. */ 5994 p = SUG(*gap, i).st_word; 5995 for (j = 0; j < ga.ga_len; ++j) 5996 if (STRCMP(stp[j].st_word, p) == 0) 5997 break; 5998 if (j == ga.ga_len) 5999 stp[ga.ga_len++] = SUG(*gap, i); 6000 else 6001 vim_free(p); 6002 } 6003 } 6004 } 6005 6006 ga_clear(&su->su_ga); 6007 ga_clear(&su->su_sga); 6008 6009 /* Truncate the list to the number of suggestions that will be displayed. */ 6010 if (ga.ga_len > su->su_maxcount) 6011 { 6012 for (i = su->su_maxcount; i < ga.ga_len; ++i) 6013 vim_free(stp[i].st_word); 6014 ga.ga_len = su->su_maxcount; 6015 } 6016 6017 su->su_ga = ga; 6018 } 6019 6020 /* 6021 * For the goodword in "stp" compute the soundalike score compared to the 6022 * badword. 
6023 */ 6024 static int 6025 stp_sal_score( 6026 suggest_T *stp, 6027 suginfo_T *su, 6028 slang_T *slang, 6029 char_u *badsound) /* sound-folded badword */ 6030 { 6031 char_u *p; 6032 char_u *pbad; 6033 char_u *pgood; 6034 char_u badsound2[MAXWLEN]; 6035 char_u fword[MAXWLEN]; 6036 char_u goodsound[MAXWLEN]; 6037 char_u goodword[MAXWLEN]; 6038 int lendiff; 6039 6040 lendiff = (int)(su->su_badlen - stp->st_orglen); 6041 if (lendiff >= 0) 6042 pbad = badsound; 6043 else 6044 { 6045 /* soundfold the bad word with more characters following */ 6046 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN); 6047 6048 /* When joining two words the sound often changes a lot. E.g., "t he" 6049 * sounds like "t h" while "the" sounds like "@". Avoid that by 6050 * removing the space. Don't do it when the good word also contains a 6051 * space. */ 6052 if (VIM_ISWHITE(su->su_badptr[su->su_badlen]) 6053 && *skiptowhite(stp->st_word) == NUL) 6054 for (p = fword; *(p = skiptowhite(p)) != NUL; ) 6055 STRMOVE(p, p + 1); 6056 6057 spell_soundfold(slang, fword, TRUE, badsound2); 6058 pbad = badsound2; 6059 } 6060 6061 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) 6062 { 6063 /* Add part of the bad word to the good word, so that we soundfold 6064 * what replaces the bad word. */ 6065 STRCPY(goodword, stp->st_word); 6066 vim_strncpy(goodword + stp->st_wordlen, 6067 su->su_badptr + su->su_badlen - lendiff, lendiff); 6068 pgood = goodword; 6069 } 6070 else 6071 pgood = stp->st_word; 6072 6073 /* Sound-fold the word and compute the score for the difference. */ 6074 spell_soundfold(slang, pgood, FALSE, goodsound); 6075 6076 return soundalike_score(goodsound, pbad); 6077 } 6078 6079 /* structure used to store soundfolded words that add_sound_suggest() has 6080 * handled already. 
*/ 6081 typedef struct 6082 { 6083 short sft_score; /* lowest score used */ 6084 char_u sft_word[1]; /* soundfolded word, actually longer */ 6085 } sftword_T; 6086 6087 static sftword_T dumsft; 6088 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft))) 6089 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) 6090 6091 /* 6092 * Prepare for calling suggest_try_soundalike(). 6093 */ 6094 static void 6095 suggest_try_soundalike_prep(void) 6096 { 6097 langp_T *lp; 6098 int lpi; 6099 slang_T *slang; 6100 6101 /* Do this for all languages that support sound folding and for which a 6102 * .sug file has been loaded. */ 6103 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6104 { 6105 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6106 slang = lp->lp_slang; 6107 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6108 /* prepare the hashtable used by add_sound_suggest() */ 6109 hash_init(&slang->sl_sounddone); 6110 } 6111 } 6112 6113 /* 6114 * Find suggestions by comparing the word in a sound-a-like form. 6115 * Note: This doesn't support postponed prefixes. 6116 */ 6117 static void 6118 suggest_try_soundalike(suginfo_T *su) 6119 { 6120 char_u salword[MAXWLEN]; 6121 langp_T *lp; 6122 int lpi; 6123 slang_T *slang; 6124 6125 /* Do this for all languages that support sound folding and for which a 6126 * .sug file has been loaded. */ 6127 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6128 { 6129 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6130 slang = lp->lp_slang; 6131 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6132 { 6133 /* soundfold the bad word */ 6134 spell_soundfold(slang, su->su_fbadword, TRUE, salword); 6135 6136 /* try all kinds of inserts/deletes/swaps/etc. 
*/ 6137 /* TODO: also soundfold the next words, so that we can try joining 6138 * and splitting */ 6139 #ifdef SUGGEST_PROFILE 6140 prof_init(); 6141 #endif 6142 suggest_trie_walk(su, lp, salword, TRUE); 6143 #ifdef SUGGEST_PROFILE 6144 prof_report("soundalike"); 6145 #endif 6146 } 6147 } 6148 } 6149 6150 /* 6151 * Finish up after calling suggest_try_soundalike(). 6152 */ 6153 static void 6154 suggest_try_soundalike_finish(void) 6155 { 6156 langp_T *lp; 6157 int lpi; 6158 slang_T *slang; 6159 int todo; 6160 hashitem_T *hi; 6161 6162 /* Do this for all languages that support sound folding and for which a 6163 * .sug file has been loaded. */ 6164 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6165 { 6166 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6167 slang = lp->lp_slang; 6168 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6169 { 6170 /* Free the info about handled words. */ 6171 todo = (int)slang->sl_sounddone.ht_used; 6172 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi) 6173 if (!HASHITEM_EMPTY(hi)) 6174 { 6175 vim_free(HI2SFT(hi)); 6176 --todo; 6177 } 6178 6179 /* Clear the hashtable, it may also be used by another region. */ 6180 hash_clear(&slang->sl_sounddone); 6181 hash_init(&slang->sl_sounddone); 6182 } 6183 } 6184 } 6185 6186 /* 6187 * A match with a soundfolded word is found. Add the good word(s) that 6188 * produce this soundfolded word. 
6189 */ 6190 static void 6191 add_sound_suggest( 6192 suginfo_T *su, 6193 char_u *goodword, 6194 int score, /* soundfold score */ 6195 langp_T *lp) 6196 { 6197 slang_T *slang = lp->lp_slang; /* language for sound folding */ 6198 int sfwordnr; 6199 char_u *nrline; 6200 int orgnr; 6201 char_u theword[MAXWLEN]; 6202 int i; 6203 int wlen; 6204 char_u *byts; 6205 idx_T *idxs; 6206 int n; 6207 int wordcount; 6208 int wc; 6209 int goodscore; 6210 hash_T hash; 6211 hashitem_T *hi; 6212 sftword_T *sft; 6213 int bc, gc; 6214 int limit; 6215 6216 /* 6217 * It's very well possible that the same soundfold word is found several 6218 * times with different scores. Since the following is quite slow only do 6219 * the words that have a better score than before. Use a hashtable to 6220 * remember the words that have been done. 6221 */ 6222 hash = hash_hash(goodword); 6223 hi = hash_lookup(&slang->sl_sounddone, goodword, hash); 6224 if (HASHITEM_EMPTY(hi)) 6225 { 6226 sft = alloc(sizeof(sftword_T) + STRLEN(goodword)); 6227 if (sft != NULL) 6228 { 6229 sft->sft_score = score; 6230 STRCPY(sft->sft_word, goodword); 6231 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash); 6232 } 6233 } 6234 else 6235 { 6236 sft = HI2SFT(hi); 6237 if (score >= sft->sft_score) 6238 return; 6239 sft->sft_score = score; 6240 } 6241 6242 /* 6243 * Find the word nr in the soundfold tree. 6244 */ 6245 sfwordnr = soundfold_find(slang, goodword); 6246 if (sfwordnr < 0) 6247 { 6248 internal_error("add_sound_suggest()"); 6249 return; 6250 } 6251 6252 /* 6253 * go over the list of good words that produce this soundfold word 6254 */ 6255 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE); 6256 orgnr = 0; 6257 while (*nrline != NUL) 6258 { 6259 /* The wordnr was stored in a minimal nr of bytes as an offset to the 6260 * previous wordnr. 
*/ 6261 orgnr += bytes2offset(&nrline); 6262 6263 byts = slang->sl_fbyts; 6264 idxs = slang->sl_fidxs; 6265 6266 /* Lookup the word "orgnr" one of the two tries. */ 6267 n = 0; 6268 wordcount = 0; 6269 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen) 6270 { 6271 i = 1; 6272 if (wordcount == orgnr && byts[n + 1] == NUL) 6273 break; /* found end of word */ 6274 6275 if (byts[n + 1] == NUL) 6276 ++wordcount; 6277 6278 /* skip over the NUL bytes */ 6279 for ( ; byts[n + i] == NUL; ++i) 6280 if (i > byts[n]) /* safety check */ 6281 { 6282 STRCPY(theword + wlen, "BAD"); 6283 wlen += 3; 6284 goto badword; 6285 } 6286 6287 /* One of the siblings must have the word. */ 6288 for ( ; i < byts[n]; ++i) 6289 { 6290 wc = idxs[idxs[n + i]]; /* nr of words under this byte */ 6291 if (wordcount + wc > orgnr) 6292 break; 6293 wordcount += wc; 6294 } 6295 6296 theword[wlen] = byts[n + i]; 6297 n = idxs[n + i]; 6298 } 6299 badword: 6300 theword[wlen] = NUL; 6301 6302 /* Go over the possible flags and regions. */ 6303 for (; i <= byts[n] && byts[n + i] == NUL; ++i) 6304 { 6305 char_u cword[MAXWLEN]; 6306 char_u *p; 6307 int flags = (int)idxs[n + i]; 6308 6309 /* Skip words with the NOSUGGEST flag */ 6310 if (flags & WF_NOSUGGEST) 6311 continue; 6312 6313 if (flags & WF_KEEPCAP) 6314 { 6315 /* Must find the word in the keep-case tree. */ 6316 find_keepcap_word(slang, theword, cword); 6317 p = cword; 6318 } 6319 else 6320 { 6321 flags |= su->su_badflags; 6322 if ((flags & WF_CAPMASK) != 0) 6323 { 6324 /* Need to fix case according to "flags". */ 6325 make_case_word(theword, cword, flags); 6326 p = cword; 6327 } 6328 else 6329 p = theword; 6330 } 6331 6332 /* Add the suggestion. */ 6333 if (sps_flags & SPS_DOUBLE) 6334 { 6335 /* Add the suggestion if the score isn't too bad. */ 6336 if (score <= su->su_maxscore) 6337 add_suggestion(su, &su->su_sga, p, su->su_badlen, 6338 score, 0, FALSE, slang, FALSE); 6339 } 6340 else 6341 { 6342 /* Add a penalty for words in another region. 
*/ 6343 if ((flags & WF_REGION) 6344 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 6345 goodscore = SCORE_REGION; 6346 else 6347 goodscore = 0; 6348 6349 /* Add a small penalty for changing the first letter from 6350 * lower to upper case. Helps for "tath" -> "Kath", which is 6351 * less common than "tath" -> "path". Don't do it when the 6352 * letter is the same, that has already been counted. */ 6353 gc = PTR2CHAR(p); 6354 if (SPELL_ISUPPER(gc)) 6355 { 6356 bc = PTR2CHAR(su->su_badword); 6357 if (!SPELL_ISUPPER(bc) 6358 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) 6359 goodscore += SCORE_ICASE / 2; 6360 } 6361 6362 /* Compute the score for the good word. This only does letter 6363 * insert/delete/swap/replace. REP items are not considered, 6364 * which may make the score a bit higher. 6365 * Use a limit for the score to make it work faster. Use 6366 * MAXSCORE(), because RESCORE() will change the score. 6367 * If the limit is very high then the iterative method is 6368 * inefficient, using an array is quicker. */ 6369 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); 6370 if (limit > SCORE_LIMITMAX) 6371 goodscore += spell_edit_score(slang, su->su_badword, p); 6372 else 6373 goodscore += spell_edit_score_limit(slang, su->su_badword, 6374 p, limit); 6375 6376 /* When going over the limit don't bother to do the rest. */ 6377 if (goodscore < SCORE_MAXMAX) 6378 { 6379 /* Give a bonus to words seen before. */ 6380 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE); 6381 6382 /* Add the suggestion if the score isn't too bad. */ 6383 goodscore = RESCORE(goodscore, score); 6384 if (goodscore <= su->su_sfmaxscore) 6385 add_suggestion(su, &su->su_ga, p, su->su_badlen, 6386 goodscore, score, TRUE, slang, TRUE); 6387 } 6388 } 6389 } 6390 /* smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr); */ 6391 } 6392 } 6393 6394 /* 6395 * Find word "word" in fold-case tree for "slang" and return the word number. 
6396 */ 6397 static int 6398 soundfold_find(slang_T *slang, char_u *word) 6399 { 6400 idx_T arridx = 0; 6401 int len; 6402 int wlen = 0; 6403 int c; 6404 char_u *ptr = word; 6405 char_u *byts; 6406 idx_T *idxs; 6407 int wordnr = 0; 6408 6409 byts = slang->sl_sbyts; 6410 idxs = slang->sl_sidxs; 6411 6412 for (;;) 6413 { 6414 /* First byte is the number of possible bytes. */ 6415 len = byts[arridx++]; 6416 6417 /* If the first possible byte is a zero the word could end here. 6418 * If the word ends we found the word. If not skip the NUL bytes. */ 6419 c = ptr[wlen]; 6420 if (byts[arridx] == NUL) 6421 { 6422 if (c == NUL) 6423 break; 6424 6425 /* Skip over the zeros, there can be several. */ 6426 while (len > 0 && byts[arridx] == NUL) 6427 { 6428 ++arridx; 6429 --len; 6430 } 6431 if (len == 0) 6432 return -1; /* no children, word should have ended here */ 6433 ++wordnr; 6434 } 6435 6436 /* If the word ends we didn't find it. */ 6437 if (c == NUL) 6438 return -1; 6439 6440 /* Perform a binary search in the list of accepted bytes. */ 6441 if (c == TAB) /* <Tab> is handled like <Space> */ 6442 c = ' '; 6443 while (byts[arridx] < c) 6444 { 6445 /* The word count is in the first idxs[] entry of the child. */ 6446 wordnr += idxs[idxs[arridx]]; 6447 ++arridx; 6448 if (--len == 0) /* end of the bytes, didn't find it */ 6449 return -1; 6450 } 6451 if (byts[arridx] != c) /* didn't find the byte */ 6452 return -1; 6453 6454 /* Continue at the child (if there is one). */ 6455 arridx = idxs[arridx]; 6456 ++wlen; 6457 6458 /* One space in the good word may stand for several spaces in the 6459 * checked word. */ 6460 if (c == ' ') 6461 while (ptr[wlen] == ' ' || ptr[wlen] == TAB) 6462 ++wlen; 6463 } 6464 6465 return wordnr; 6466 } 6467 6468 /* 6469 * Copy "fword" to "cword", fixing case according to "flags". 
6470 */ 6471 static void 6472 make_case_word(char_u *fword, char_u *cword, int flags) 6473 { 6474 if (flags & WF_ALLCAP) 6475 /* Make it all upper-case */ 6476 allcap_copy(fword, cword); 6477 else if (flags & WF_ONECAP) 6478 /* Make the first letter upper-case */ 6479 onecap_copy(fword, cword, TRUE); 6480 else 6481 /* Use goodword as-is. */ 6482 STRCPY(cword, fword); 6483 } 6484 6485 6486 /* 6487 * Return TRUE if "c1" and "c2" are similar characters according to the MAP 6488 * lines in the .aff file. 6489 */ 6490 static int 6491 similar_chars(slang_T *slang, int c1, int c2) 6492 { 6493 int m1, m2; 6494 char_u buf[MB_MAXBYTES + 1]; 6495 hashitem_T *hi; 6496 6497 if (c1 >= 256) 6498 { 6499 buf[mb_char2bytes(c1, buf)] = 0; 6500 hi = hash_find(&slang->sl_map_hash, buf); 6501 if (HASHITEM_EMPTY(hi)) 6502 m1 = 0; 6503 else 6504 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); 6505 } 6506 else 6507 m1 = slang->sl_map_array[c1]; 6508 if (m1 == 0) 6509 return FALSE; 6510 6511 6512 if (c2 >= 256) 6513 { 6514 buf[mb_char2bytes(c2, buf)] = 0; 6515 hi = hash_find(&slang->sl_map_hash, buf); 6516 if (HASHITEM_EMPTY(hi)) 6517 m2 = 0; 6518 else 6519 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); 6520 } 6521 else 6522 m2 = slang->sl_map_array[c2]; 6523 6524 return m1 == m2; 6525 } 6526 6527 /* 6528 * Add a suggestion to the list of suggestions. 6529 * For a suggestion that is already in the list the lowest score is remembered. 6530 */ 6531 static void 6532 add_suggestion( 6533 suginfo_T *su, 6534 garray_T *gap, /* either su_ga or su_sga */ 6535 char_u *goodword, 6536 int badlenarg, /* len of bad word replaced with "goodword" */ 6537 int score, 6538 int altscore, 6539 int had_bonus, /* value for st_had_bonus */ 6540 slang_T *slang, /* language for sound folding */ 6541 int maxsf) /* su_maxscore applies to soundfold score, 6542 su_sfmaxscore to the total score. 
*/ 6543 { 6544 int goodlen; /* len of goodword changed */ 6545 int badlen; /* len of bad word changed */ 6546 suggest_T *stp; 6547 suggest_T new_sug; 6548 int i; 6549 char_u *pgood, *pbad; 6550 6551 /* Minimize "badlen" for consistency. Avoids that changing "the the" to 6552 * "thee the" is added next to changing the first "the" the "thee". */ 6553 pgood = goodword + STRLEN(goodword); 6554 pbad = su->su_badptr + badlenarg; 6555 for (;;) 6556 { 6557 goodlen = (int)(pgood - goodword); 6558 badlen = (int)(pbad - su->su_badptr); 6559 if (goodlen <= 0 || badlen <= 0) 6560 break; 6561 MB_PTR_BACK(goodword, pgood); 6562 MB_PTR_BACK(su->su_badptr, pbad); 6563 if (has_mbyte) 6564 { 6565 if (mb_ptr2char(pgood) != mb_ptr2char(pbad)) 6566 break; 6567 } 6568 else if (*pgood != *pbad) 6569 break; 6570 } 6571 6572 if (badlen == 0 && goodlen == 0) 6573 /* goodword doesn't change anything; may happen for "the the" changing 6574 * the first "the" to itself. */ 6575 return; 6576 6577 if (gap->ga_len == 0) 6578 i = -1; 6579 else 6580 { 6581 /* Check if the word is already there. Also check the length that is 6582 * being replaced "thes," -> "these" is a different suggestion from 6583 * "thes" -> "these". */ 6584 stp = &SUG(*gap, 0); 6585 for (i = gap->ga_len; --i >= 0; ++stp) 6586 if (stp->st_wordlen == goodlen 6587 && stp->st_orglen == badlen 6588 && STRNCMP(stp->st_word, goodword, goodlen) == 0) 6589 { 6590 /* 6591 * Found it. Remember the word with the lowest score. 6592 */ 6593 if (stp->st_slang == NULL) 6594 stp->st_slang = slang; 6595 6596 new_sug.st_score = score; 6597 new_sug.st_altscore = altscore; 6598 new_sug.st_had_bonus = had_bonus; 6599 6600 if (stp->st_had_bonus != had_bonus) 6601 { 6602 /* Only one of the two had the soundalike score computed. 6603 * Need to do that for the other one now, otherwise the 6604 * scores can't be compared. 
This happens because 6605 * suggest_try_change() doesn't compute the soundalike 6606 * word to keep it fast, while some special methods set 6607 * the soundalike score to zero. */ 6608 if (had_bonus) 6609 rescore_one(su, stp); 6610 else 6611 { 6612 new_sug.st_word = stp->st_word; 6613 new_sug.st_wordlen = stp->st_wordlen; 6614 new_sug.st_slang = stp->st_slang; 6615 new_sug.st_orglen = badlen; 6616 rescore_one(su, &new_sug); 6617 } 6618 } 6619 6620 if (stp->st_score > new_sug.st_score) 6621 { 6622 stp->st_score = new_sug.st_score; 6623 stp->st_altscore = new_sug.st_altscore; 6624 stp->st_had_bonus = new_sug.st_had_bonus; 6625 } 6626 break; 6627 } 6628 } 6629 6630 if (i < 0 && ga_grow(gap, 1) == OK) 6631 { 6632 /* Add a suggestion. */ 6633 stp = &SUG(*gap, gap->ga_len); 6634 stp->st_word = vim_strnsave(goodword, goodlen); 6635 if (stp->st_word != NULL) 6636 { 6637 stp->st_wordlen = goodlen; 6638 stp->st_score = score; 6639 stp->st_altscore = altscore; 6640 stp->st_had_bonus = had_bonus; 6641 stp->st_orglen = badlen; 6642 stp->st_slang = slang; 6643 ++gap->ga_len; 6644 6645 /* If we have too many suggestions now, sort the list and keep 6646 * the best suggestions. */ 6647 if (gap->ga_len > SUG_MAX_COUNT(su)) 6648 { 6649 if (maxsf) 6650 su->su_sfmaxscore = cleanup_suggestions(gap, 6651 su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); 6652 else 6653 su->su_maxscore = cleanup_suggestions(gap, 6654 su->su_maxscore, SUG_CLEAN_COUNT(su)); 6655 } 6656 } 6657 } 6658 } 6659 6660 /* 6661 * Suggestions may in fact be flagged as errors. Esp. for banned words and 6662 * for split words, such as "the the". Remove these from the list here. 
6663 */ 6664 static void 6665 check_suggestions( 6666 suginfo_T *su, 6667 garray_T *gap) /* either su_ga or su_sga */ 6668 { 6669 suggest_T *stp; 6670 int i; 6671 char_u longword[MAXWLEN + 1]; 6672 int len; 6673 hlf_T attr; 6674 6675 stp = &SUG(*gap, 0); 6676 for (i = gap->ga_len - 1; i >= 0; --i) 6677 { 6678 /* Need to append what follows to check for "the the". */ 6679 vim_strncpy(longword, stp[i].st_word, MAXWLEN); 6680 len = stp[i].st_wordlen; 6681 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen, 6682 MAXWLEN - len); 6683 attr = HLF_COUNT; 6684 (void)spell_check(curwin, longword, &attr, NULL, FALSE); 6685 if (attr != HLF_COUNT) 6686 { 6687 /* Remove this entry. */ 6688 vim_free(stp[i].st_word); 6689 --gap->ga_len; 6690 if (i < gap->ga_len) 6691 mch_memmove(stp + i, stp + i + 1, 6692 sizeof(suggest_T) * (gap->ga_len - i)); 6693 } 6694 } 6695 } 6696 6697 6698 /* 6699 * Add a word to be banned. 6700 */ 6701 static void 6702 add_banned( 6703 suginfo_T *su, 6704 char_u *word) 6705 { 6706 char_u *s; 6707 hash_T hash; 6708 hashitem_T *hi; 6709 6710 hash = hash_hash(word); 6711 hi = hash_lookup(&su->su_banned, word, hash); 6712 if (HASHITEM_EMPTY(hi)) 6713 { 6714 s = vim_strsave(word); 6715 if (s != NULL) 6716 hash_add_item(&su->su_banned, hi, s, hash); 6717 } 6718 } 6719 6720 /* 6721 * Recompute the score for all suggestions if sound-folding is possible. This 6722 * is slow, thus only done for the final results. 6723 */ 6724 static void 6725 rescore_suggestions(suginfo_T *su) 6726 { 6727 int i; 6728 6729 if (su->su_sallang != NULL) 6730 for (i = 0; i < su->su_ga.ga_len; ++i) 6731 rescore_one(su, &SUG(su->su_ga, i)); 6732 } 6733 6734 /* 6735 * Recompute the score for one suggestion if sound-folding is possible. 
6736 */ 6737 static void 6738 rescore_one(suginfo_T *su, suggest_T *stp) 6739 { 6740 slang_T *slang = stp->st_slang; 6741 char_u sal_badword[MAXWLEN]; 6742 char_u *p; 6743 6744 /* Only rescore suggestions that have no sal score yet and do have a 6745 * language. */ 6746 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus) 6747 { 6748 if (slang == su->su_sallang) 6749 p = su->su_sal_badword; 6750 else 6751 { 6752 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword); 6753 p = sal_badword; 6754 } 6755 6756 stp->st_altscore = stp_sal_score(stp, su, slang, p); 6757 if (stp->st_altscore == SCORE_MAXMAX) 6758 stp->st_altscore = SCORE_BIG; 6759 stp->st_score = RESCORE(stp->st_score, stp->st_altscore); 6760 stp->st_had_bonus = TRUE; 6761 } 6762 } 6763 6764 static int sug_compare(const void *s1, const void *s2); 6765 6766 /* 6767 * Function given to qsort() to sort the suggestions on st_score. 6768 * First on "st_score", then "st_altscore" then alphabetically. 6769 */ 6770 static int 6771 sug_compare(const void *s1, const void *s2) 6772 { 6773 suggest_T *p1 = (suggest_T *)s1; 6774 suggest_T *p2 = (suggest_T *)s2; 6775 int n = p1->st_score - p2->st_score; 6776 6777 if (n == 0) 6778 { 6779 n = p1->st_altscore - p2->st_altscore; 6780 if (n == 0) 6781 n = STRICMP(p1->st_word, p2->st_word); 6782 } 6783 return n; 6784 } 6785 6786 /* 6787 * Cleanup the suggestions: 6788 * - Sort on score. 6789 * - Remove words that won't be displayed. 6790 * Returns the maximum score in the list or "maxscore" unmodified. 6791 */ 6792 static int 6793 cleanup_suggestions( 6794 garray_T *gap, 6795 int maxscore, 6796 int keep) /* nr of suggestions to keep */ 6797 { 6798 suggest_T *stp = &SUG(*gap, 0); 6799 int i; 6800 6801 /* Sort the list. */ 6802 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); 6803 6804 /* Truncate the list to the number of suggestions that will be displayed. 
*/ 6805 if (gap->ga_len > keep) 6806 { 6807 for (i = keep; i < gap->ga_len; ++i) 6808 vim_free(stp[i].st_word); 6809 gap->ga_len = keep; 6810 return stp[keep - 1].st_score; 6811 } 6812 return maxscore; 6813 } 6814 6815 #if defined(FEAT_EVAL) || defined(PROTO) 6816 /* 6817 * Soundfold a string, for soundfold(). 6818 * Result is in allocated memory, NULL for an error. 6819 */ 6820 char_u * 6821 eval_soundfold(char_u *word) 6822 { 6823 langp_T *lp; 6824 char_u sound[MAXWLEN]; 6825 int lpi; 6826 6827 if (curwin->w_p_spell && *curwin->w_s->b_p_spl != NUL) 6828 /* Use the sound-folding of the first language that supports it. */ 6829 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6830 { 6831 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6832 if (lp->lp_slang->sl_sal.ga_len > 0) 6833 { 6834 /* soundfold the word */ 6835 spell_soundfold(lp->lp_slang, word, FALSE, sound); 6836 return vim_strsave(sound); 6837 } 6838 } 6839 6840 /* No language with sound folding, return word as-is. */ 6841 return vim_strsave(word); 6842 } 6843 #endif 6844 6845 /* 6846 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 6847 * 6848 * There are many ways to turn a word into a sound-a-like representation. The 6849 * oldest is Soundex (1918!). A nice overview can be found in "Approximate 6850 * swedish name matching - survey and test of different algorithms" by Klas 6851 * Erikson. 6852 * 6853 * We support two methods: 6854 * 1. SOFOFROM/SOFOTO do a simple character mapping. 6855 * 2. SAL items define a more advanced sound-folding (and much slower). 6856 */ 6857 void 6858 spell_soundfold( 6859 slang_T *slang, 6860 char_u *inword, 6861 int folded, /* "inword" is already case-folded */ 6862 char_u *res) 6863 { 6864 char_u fword[MAXWLEN]; 6865 char_u *word; 6866 6867 if (slang->sl_sofo) 6868 /* SOFOFROM and SOFOTO used */ 6869 spell_soundfold_sofo(slang, inword, res); 6870 else 6871 { 6872 /* SAL items used. Requires the word to be case-folded. 
*/ 6873 if (folded) 6874 word = inword; 6875 else 6876 { 6877 (void)spell_casefold(inword, (int)STRLEN(inword), fword, MAXWLEN); 6878 word = fword; 6879 } 6880 6881 if (has_mbyte) 6882 spell_soundfold_wsal(slang, word, res); 6883 else 6884 spell_soundfold_sal(slang, word, res); 6885 } 6886 } 6887 6888 /* 6889 * Perform sound folding of "inword" into "res" according to SOFOFROM and 6890 * SOFOTO lines. 6891 */ 6892 static void 6893 spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res) 6894 { 6895 char_u *s; 6896 int ri = 0; 6897 int c; 6898 6899 if (has_mbyte) 6900 { 6901 int prevc = 0; 6902 int *ip; 6903 6904 /* The sl_sal_first[] table contains the translation for chars up to 6905 * 255, sl_sal the rest. */ 6906 for (s = inword; *s != NUL; ) 6907 { 6908 c = mb_cptr2char_adv(&s); 6909 if (enc_utf8 ? utf_class(c) == 0 : VIM_ISWHITE(c)) 6910 c = ' '; 6911 else if (c < 256) 6912 c = slang->sl_sal_first[c]; 6913 else 6914 { 6915 ip = ((int **)slang->sl_sal.ga_data)[c & 0xff]; 6916 if (ip == NULL) /* empty list, can't match */ 6917 c = NUL; 6918 else 6919 for (;;) /* find "c" in the list */ 6920 { 6921 if (*ip == 0) /* not found */ 6922 { 6923 c = NUL; 6924 break; 6925 } 6926 if (*ip == c) /* match! */ 6927 { 6928 c = ip[1]; 6929 break; 6930 } 6931 ip += 2; 6932 } 6933 } 6934 6935 if (c != NUL && c != prevc) 6936 { 6937 ri += mb_char2bytes(c, res + ri); 6938 if (ri + MB_MAXBYTES > MAXWLEN) 6939 break; 6940 prevc = c; 6941 } 6942 } 6943 } 6944 else 6945 { 6946 /* The sl_sal_first[] table contains the translation. 
*/ 6947 for (s = inword; (c = *s) != NUL; ++s) 6948 { 6949 if (VIM_ISWHITE(c)) 6950 c = ' '; 6951 else 6952 c = slang->sl_sal_first[c]; 6953 if (c != NUL && (ri == 0 || res[ri - 1] != c)) 6954 res[ri++] = c; 6955 } 6956 } 6957 6958 res[ri] = NUL; 6959 } 6960 6961 static void 6962 spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res) 6963 { 6964 salitem_T *smp; 6965 char_u word[MAXWLEN]; 6966 char_u *s = inword; 6967 char_u *t; 6968 char_u *pf; 6969 int i, j, z; 6970 int reslen; 6971 int n, k = 0; 6972 int z0; 6973 int k0; 6974 int n0; 6975 int c; 6976 int pri; 6977 int p0 = -333; 6978 int c0; 6979 6980 /* Remove accents, if wanted. We actually remove all non-word characters. 6981 * But keep white space. We need a copy, the word may be changed here. */ 6982 if (slang->sl_rem_accents) 6983 { 6984 t = word; 6985 while (*s != NUL) 6986 { 6987 if (VIM_ISWHITE(*s)) 6988 { 6989 *t++ = ' '; 6990 s = skipwhite(s); 6991 } 6992 else 6993 { 6994 if (spell_iswordp_nmw(s, curwin)) 6995 *t++ = *s; 6996 ++s; 6997 } 6998 } 6999 *t = NUL; 7000 } 7001 else 7002 vim_strncpy(word, s, MAXWLEN - 1); 7003 7004 smp = (salitem_T *)slang->sl_sal.ga_data; 7005 7006 /* 7007 * This comes from Aspell phonet.cpp. Converted from C++ to C. 7008 * Changed to keep spaces. 7009 */ 7010 i = reslen = z = 0; 7011 while ((c = word[i]) != NUL) 7012 { 7013 /* Start with the first rule that has the character in the word. */ 7014 n = slang->sl_sal_first[c]; 7015 z0 = 0; 7016 7017 if (n >= 0) 7018 { 7019 /* check all rules for the same letter */ 7020 for (; (s = smp[n].sm_lead)[0] == c; ++n) 7021 { 7022 /* Quickly skip entries that don't match the word. Most 7023 * entries are less then three chars, optimize for that. 
*/ 7024 k = smp[n].sm_leadlen; 7025 if (k > 1) 7026 { 7027 if (word[i + 1] != s[1]) 7028 continue; 7029 if (k > 2) 7030 { 7031 for (j = 2; j < k; ++j) 7032 if (word[i + j] != s[j]) 7033 break; 7034 if (j < k) 7035 continue; 7036 } 7037 } 7038 7039 if ((pf = smp[n].sm_oneof) != NULL) 7040 { 7041 /* Check for match with one of the chars in "sm_oneof". */ 7042 while (*pf != NUL && *pf != word[i + k]) 7043 ++pf; 7044 if (*pf == NUL) 7045 continue; 7046 ++k; 7047 } 7048 s = smp[n].sm_rules; 7049 pri = 5; /* default priority */ 7050 7051 p0 = *s; 7052 k0 = k; 7053 while (*s == '-' && k > 1) 7054 { 7055 k--; 7056 s++; 7057 } 7058 if (*s == '<') 7059 s++; 7060 if (VIM_ISDIGIT(*s)) 7061 { 7062 /* determine priority */ 7063 pri = *s - '0'; 7064 s++; 7065 } 7066 if (*s == '^' && *(s + 1) == '^') 7067 s++; 7068 7069 if (*s == NUL 7070 || (*s == '^' 7071 && (i == 0 || !(word[i - 1] == ' ' 7072 || spell_iswordp(word + i - 1, curwin))) 7073 && (*(s + 1) != '$' 7074 || (!spell_iswordp(word + i + k0, curwin)))) 7075 || (*s == '$' && i > 0 7076 && spell_iswordp(word + i - 1, curwin) 7077 && (!spell_iswordp(word + i + k0, curwin)))) 7078 { 7079 /* search for followup rules, if: */ 7080 /* followup and k > 1 and NO '-' in searchstring */ 7081 c0 = word[i + k - 1]; 7082 n0 = slang->sl_sal_first[c0]; 7083 7084 if (slang->sl_followup && k > 1 && n0 >= 0 7085 && p0 != '-' && word[i + k] != NUL) 7086 { 7087 /* test follow-up rule for "word[i + k]" */ 7088 for ( ; (s = smp[n0].sm_lead)[0] == c0; ++n0) 7089 { 7090 /* Quickly skip entries that don't match the word. 7091 * */ 7092 k0 = smp[n0].sm_leadlen; 7093 if (k0 > 1) 7094 { 7095 if (word[i + k] != s[1]) 7096 continue; 7097 if (k0 > 2) 7098 { 7099 pf = word + i + k + 1; 7100 for (j = 2; j < k0; ++j) 7101 if (*pf++ != s[j]) 7102 break; 7103 if (j < k0) 7104 continue; 7105 } 7106 } 7107 k0 += k - 1; 7108 7109 if ((pf = smp[n0].sm_oneof) != NULL) 7110 { 7111 /* Check for match with one of the chars in 7112 * "sm_oneof". 
*/ 7113 while (*pf != NUL && *pf != word[i + k0]) 7114 ++pf; 7115 if (*pf == NUL) 7116 continue; 7117 ++k0; 7118 } 7119 7120 p0 = 5; 7121 s = smp[n0].sm_rules; 7122 while (*s == '-') 7123 { 7124 /* "k0" gets NOT reduced because 7125 * "if (k0 == k)" */ 7126 s++; 7127 } 7128 if (*s == '<') 7129 s++; 7130 if (VIM_ISDIGIT(*s)) 7131 { 7132 p0 = *s - '0'; 7133 s++; 7134 } 7135 7136 if (*s == NUL 7137 /* *s == '^' cuts */ 7138 || (*s == '$' 7139 && !spell_iswordp(word + i + k0, 7140 curwin))) 7141 { 7142 if (k0 == k) 7143 /* this is just a piece of the string */ 7144 continue; 7145 7146 if (p0 < pri) 7147 /* priority too low */ 7148 continue; 7149 /* rule fits; stop search */ 7150 break; 7151 } 7152 } 7153 7154 if (p0 >= pri && smp[n0].sm_lead[0] == c0) 7155 continue; 7156 } 7157 7158 /* replace string */ 7159 s = smp[n].sm_to; 7160 if (s == NULL) 7161 s = (char_u *)""; 7162 pf = smp[n].sm_rules; 7163 p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0; 7164 if (p0 == 1 && z == 0) 7165 { 7166 /* rule with '<' is used */ 7167 if (reslen > 0 && *s != NUL && (res[reslen - 1] == c 7168 || res[reslen - 1] == *s)) 7169 reslen--; 7170 z0 = 1; 7171 z = 1; 7172 k0 = 0; 7173 while (*s != NUL && word[i + k0] != NUL) 7174 { 7175 word[i + k0] = *s; 7176 k0++; 7177 s++; 7178 } 7179 if (k > k0) 7180 STRMOVE(word + i + k0, word + i + k); 7181 7182 /* new "actual letter" */ 7183 c = word[i]; 7184 } 7185 else 7186 { 7187 /* no '<' rule used */ 7188 i += k - 1; 7189 z = 0; 7190 while (*s != NUL && s[1] != NUL && reslen < MAXWLEN) 7191 { 7192 if (reslen == 0 || res[reslen - 1] != *s) 7193 res[reslen++] = *s; 7194 s++; 7195 } 7196 /* new "actual letter" */ 7197 c = *s; 7198 if (strstr((char *)pf, "^^") != NULL) 7199 { 7200 if (c != NUL) 7201 res[reslen++] = c; 7202 STRMOVE(word, word + i + 1); 7203 i = 0; 7204 z0 = 1; 7205 } 7206 } 7207 break; 7208 } 7209 } 7210 } 7211 else if (VIM_ISWHITE(c)) 7212 { 7213 c = ' '; 7214 k = 1; 7215 } 7216 7217 if (z0 == 0) 7218 { 7219 if (k && !p0 && reslen < MAXWLEN 
&& c != NUL 7220 && (!slang->sl_collapse || reslen == 0 7221 || res[reslen - 1] != c)) 7222 /* condense only double letters */ 7223 res[reslen++] = c; 7224 7225 i++; 7226 z = 0; 7227 k = 0; 7228 } 7229 } 7230 7231 res[reslen] = NUL; 7232 } 7233 7234 /* 7235 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 7236 * Multi-byte version of spell_soundfold(). 7237 */ 7238 static void 7239 spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res) 7240 { 7241 salitem_T *smp = (salitem_T *)slang->sl_sal.ga_data; 7242 int word[MAXWLEN]; 7243 int wres[MAXWLEN]; 7244 int l; 7245 char_u *s; 7246 int *ws; 7247 char_u *t; 7248 int *pf; 7249 int i, j, z; 7250 int reslen; 7251 int n, k = 0; 7252 int z0; 7253 int k0; 7254 int n0; 7255 int c; 7256 int pri; 7257 int p0 = -333; 7258 int c0; 7259 int did_white = FALSE; 7260 int wordlen; 7261 7262 7263 /* 7264 * Convert the multi-byte string to a wide-character string. 7265 * Remove accents, if wanted. We actually remove all non-word characters. 7266 * But keep white space. 7267 */ 7268 wordlen = 0; 7269 for (s = inword; *s != NUL; ) 7270 { 7271 t = s; 7272 c = mb_cptr2char_adv(&s); 7273 if (slang->sl_rem_accents) 7274 { 7275 if (enc_utf8 ? utf_class(c) == 0 : VIM_ISWHITE(c)) 7276 { 7277 if (did_white) 7278 continue; 7279 c = ' '; 7280 did_white = TRUE; 7281 } 7282 else 7283 { 7284 did_white = FALSE; 7285 if (!spell_iswordp_nmw(t, curwin)) 7286 continue; 7287 } 7288 } 7289 word[wordlen++] = c; 7290 } 7291 word[wordlen] = NUL; 7292 7293 /* 7294 * This algorithm comes from Aspell phonet.cpp. 7295 * Converted from C++ to C. Added support for multi-byte chars. 7296 * Changed to keep spaces. 7297 */ 7298 i = reslen = z = 0; 7299 while ((c = word[i]) != NUL) 7300 { 7301 /* Start with the first rule that has the character in the word. */ 7302 n = slang->sl_sal_first[c & 0xff]; 7303 z0 = 0; 7304 7305 if (n >= 0) 7306 { 7307 /* Check all rules for the same index byte. 
7308 * If c is 0x300 need extra check for the end of the array, as 7309 * (c & 0xff) is NUL. */ 7310 for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff) 7311 && ws[0] != NUL; ++n) 7312 { 7313 /* Quickly skip entries that don't match the word. Most 7314 * entries are less then three chars, optimize for that. */ 7315 if (c != ws[0]) 7316 continue; 7317 k = smp[n].sm_leadlen; 7318 if (k > 1) 7319 { 7320 if (word[i + 1] != ws[1]) 7321 continue; 7322 if (k > 2) 7323 { 7324 for (j = 2; j < k; ++j) 7325 if (word[i + j] != ws[j]) 7326 break; 7327 if (j < k) 7328 continue; 7329 } 7330 } 7331 7332 if ((pf = smp[n].sm_oneof_w) != NULL) 7333 { 7334 /* Check for match with one of the chars in "sm_oneof". */ 7335 while (*pf != NUL && *pf != word[i + k]) 7336 ++pf; 7337 if (*pf == NUL) 7338 continue; 7339 ++k; 7340 } 7341 s = smp[n].sm_rules; 7342 pri = 5; /* default priority */ 7343 7344 p0 = *s; 7345 k0 = k; 7346 while (*s == '-' && k > 1) 7347 { 7348 k--; 7349 s++; 7350 } 7351 if (*s == '<') 7352 s++; 7353 if (VIM_ISDIGIT(*s)) 7354 { 7355 /* determine priority */ 7356 pri = *s - '0'; 7357 s++; 7358 } 7359 if (*s == '^' && *(s + 1) == '^') 7360 s++; 7361 7362 if (*s == NUL 7363 || (*s == '^' 7364 && (i == 0 || !(word[i - 1] == ' ' 7365 || spell_iswordp_w(word + i - 1, curwin))) 7366 && (*(s + 1) != '$' 7367 || (!spell_iswordp_w(word + i + k0, curwin)))) 7368 || (*s == '$' && i > 0 7369 && spell_iswordp_w(word + i - 1, curwin) 7370 && (!spell_iswordp_w(word + i + k0, curwin)))) 7371 { 7372 /* search for followup rules, if: */ 7373 /* followup and k > 1 and NO '-' in searchstring */ 7374 c0 = word[i + k - 1]; 7375 n0 = slang->sl_sal_first[c0 & 0xff]; 7376 7377 if (slang->sl_followup && k > 1 && n0 >= 0 7378 && p0 != '-' && word[i + k] != NUL) 7379 { 7380 /* Test follow-up rule for "word[i + k]"; loop over 7381 * all entries with the same index byte. 
*/ 7382 for ( ; ((ws = smp[n0].sm_lead_w)[0] & 0xff) 7383 == (c0 & 0xff); ++n0) 7384 { 7385 /* Quickly skip entries that don't match the word. 7386 */ 7387 if (c0 != ws[0]) 7388 continue; 7389 k0 = smp[n0].sm_leadlen; 7390 if (k0 > 1) 7391 { 7392 if (word[i + k] != ws[1]) 7393 continue; 7394 if (k0 > 2) 7395 { 7396 pf = word + i + k + 1; 7397 for (j = 2; j < k0; ++j) 7398 if (*pf++ != ws[j]) 7399 break; 7400 if (j < k0) 7401 continue; 7402 } 7403 } 7404 k0 += k - 1; 7405 7406 if ((pf = smp[n0].sm_oneof_w) != NULL) 7407 { 7408 /* Check for match with one of the chars in 7409 * "sm_oneof". */ 7410 while (*pf != NUL && *pf != word[i + k0]) 7411 ++pf; 7412 if (*pf == NUL) 7413 continue; 7414 ++k0; 7415 } 7416 7417 p0 = 5; 7418 s = smp[n0].sm_rules; 7419 while (*s == '-') 7420 { 7421 /* "k0" gets NOT reduced because 7422 * "if (k0 == k)" */ 7423 s++; 7424 } 7425 if (*s == '<') 7426 s++; 7427 if (VIM_ISDIGIT(*s)) 7428 { 7429 p0 = *s - '0'; 7430 s++; 7431 } 7432 7433 if (*s == NUL 7434 /* *s == '^' cuts */ 7435 || (*s == '$' 7436 && !spell_iswordp_w(word + i + k0, 7437 curwin))) 7438 { 7439 if (k0 == k) 7440 /* this is just a piece of the string */ 7441 continue; 7442 7443 if (p0 < pri) 7444 /* priority too low */ 7445 continue; 7446 /* rule fits; stop search */ 7447 break; 7448 } 7449 } 7450 7451 if (p0 >= pri && (smp[n0].sm_lead_w[0] & 0xff) 7452 == (c0 & 0xff)) 7453 continue; 7454 } 7455 7456 /* replace string */ 7457 ws = smp[n].sm_to_w; 7458 s = smp[n].sm_rules; 7459 p0 = (vim_strchr(s, '<') != NULL) ? 
1 : 0; 7460 if (p0 == 1 && z == 0) 7461 { 7462 /* rule with '<' is used */ 7463 if (reslen > 0 && ws != NULL && *ws != NUL 7464 && (wres[reslen - 1] == c 7465 || wres[reslen - 1] == *ws)) 7466 reslen--; 7467 z0 = 1; 7468 z = 1; 7469 k0 = 0; 7470 if (ws != NULL) 7471 while (*ws != NUL && word[i + k0] != NUL) 7472 { 7473 word[i + k0] = *ws; 7474 k0++; 7475 ws++; 7476 } 7477 if (k > k0) 7478 mch_memmove(word + i + k0, word + i + k, 7479 sizeof(int) * (wordlen - (i + k) + 1)); 7480 7481 /* new "actual letter" */ 7482 c = word[i]; 7483 } 7484 else 7485 { 7486 /* no '<' rule used */ 7487 i += k - 1; 7488 z = 0; 7489 if (ws != NULL) 7490 while (*ws != NUL && ws[1] != NUL 7491 && reslen < MAXWLEN) 7492 { 7493 if (reslen == 0 || wres[reslen - 1] != *ws) 7494 wres[reslen++] = *ws; 7495 ws++; 7496 } 7497 /* new "actual letter" */ 7498 if (ws == NULL) 7499 c = NUL; 7500 else 7501 c = *ws; 7502 if (strstr((char *)s, "^^") != NULL) 7503 { 7504 if (c != NUL) 7505 wres[reslen++] = c; 7506 mch_memmove(word, word + i + 1, 7507 sizeof(int) * (wordlen - (i + 1) + 1)); 7508 i = 0; 7509 z0 = 1; 7510 } 7511 } 7512 break; 7513 } 7514 } 7515 } 7516 else if (VIM_ISWHITE(c)) 7517 { 7518 c = ' '; 7519 k = 1; 7520 } 7521 7522 if (z0 == 0) 7523 { 7524 if (k && !p0 && reslen < MAXWLEN && c != NUL 7525 && (!slang->sl_collapse || reslen == 0 7526 || wres[reslen - 1] != c)) 7527 /* condense only double letters */ 7528 wres[reslen++] = c; 7529 7530 i++; 7531 z = 0; 7532 k = 0; 7533 } 7534 } 7535 7536 /* Convert wide characters in "wres" to a multi-byte string in "res". */ 7537 l = 0; 7538 for (n = 0; n < reslen; ++n) 7539 { 7540 l += mb_char2bytes(wres[n], res + l); 7541 if (l + MB_MAXBYTES > MAXWLEN) 7542 break; 7543 } 7544 res[l] = NUL; 7545 } 7546 7547 /* 7548 * Compute a score for two sound-a-like words. 7549 * This permits up to two inserts/deletes/swaps/etc. to keep things fast. 7550 * Instead of a generic loop we write out the code. 
/*
 * Compute a score for two sound-a-like words.
 * This permits up to two inserts/deletes/swaps/etc. to keep things fast.
 * Instead of a generic loop we write out the code.  That keeps it fast by
 * avoiding checks that will not be possible.
 *
 * Returns the sum of the SCORE_ values of the edits found, or SCORE_MAXMAX
 * when the words cannot be made equal with the edits that are tried here.
 */
static int
soundalike_score(
    char_u      *goodstart,     /* sound-folded good word */
    char_u      *badstart)      /* sound-folded bad word */
{
    char_u      *goodsound = goodstart;
    char_u      *badsound = badstart;
    int         goodlen;
    int         badlen;
    int         n;
    char_u      *pl, *ps;       /* longest and shortest word */
    char_u      *pl2, *ps2;     /* work pointers into "pl" and "ps" */
    int         score = 0;

    /* Adding/inserting "*" at the start (word starts with vowel) shouldn't be
     * counted so much, vowels halfway the word aren't counted at all. */
    if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
    {
        if ((badsound[0] == NUL && goodsound[1] == NUL)
            || (goodsound[0] == NUL && badsound[1] == NUL))
            /* changing word with vowel to word without a sound */
            return SCORE_DEL;
        if (badsound[0] == NUL || goodsound[0] == NUL)
            /* more than two changes */
            return SCORE_MAXMAX;

        if (badsound[1] == goodsound[1]
                || (badsound[1] != NUL
                    && goodsound[1] != NUL
                    && badsound[2] == goodsound[2]))
        {
            /* handle like a substitute */
        }
        else
        {
            /* Skip the "*" and count it for two thirds of a delete. */
            score = 2 * SCORE_DEL / 3;
            if (*badsound == '*')
                ++badsound;
            else
                ++goodsound;
        }
    }

    goodlen = (int)STRLEN(goodsound);
    badlen = (int)STRLEN(badsound);

    /* Return quickly if the lengths are too different to be fixed by two
     * changes. */
    n = goodlen - badlen;
    if (n < -2 || n > 2)
        return SCORE_MAXMAX;

    if (n > 0)
    {
        pl = goodsound;     /* goodsound is longest */
        ps = badsound;
    }
    else
    {
        pl = badsound;      /* badsound is longest */
        ps = goodsound;
    }

    /* Skip over the identical part. */
    while (*pl == *ps && *pl != NUL)
    {
        ++pl;
        ++ps;
    }

    /* From here on "pl" and "ps" point to the first difference.  The loops
     * below that do not check for NUL compare tails of different length,
     * thus they stop at a difference before going past the end. */
    switch (n)
    {
        case -2:
        case 2:
            /*
             * Must delete two characters from "pl".
             */
            ++pl;       /* first delete */
            while (*pl == *ps)
            {
                ++pl;
                ++ps;
            }
            /* strings must be equal after second delete */
            if (STRCMP(pl + 1, ps) == 0)
                return score + SCORE_DEL * 2;

            /* Failed to compare. */
            break;

        case -1:
        case 1:
            /*
             * Minimal one delete from "pl" required.
             */

            /* 1: delete */
            pl2 = pl + 1;
            ps2 = ps;
            while (*pl2 == *ps2)
            {
                if (*pl2 == NUL)        /* reached the end */
                    return score + SCORE_DEL;
                ++pl2;
                ++ps2;
            }

            /* 2: delete then swap, then rest must be equal */
            if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
                                             && STRCMP(pl2 + 2, ps2 + 2) == 0)
                return score + SCORE_DEL + SCORE_SWAP;

            /* 3: delete then substitute, then the rest must be equal */
            if (STRCMP(pl2 + 1, ps2 + 1) == 0)
                return score + SCORE_DEL + SCORE_SUBST;

            /* 4: first swap then delete */
            if (pl[0] == ps[1] && pl[1] == ps[0])
            {
                pl2 = pl + 2;       /* swap, skip two chars */
                ps2 = ps + 2;
                while (*pl2 == *ps2)
                {
                    ++pl2;
                    ++ps2;
                }
                /* delete a char and then strings must be equal */
                if (STRCMP(pl2 + 1, ps2) == 0)
                    return score + SCORE_SWAP + SCORE_DEL;
            }

            /* 5: first substitute then delete */
            pl2 = pl + 1;           /* substitute, skip one char */
            ps2 = ps + 1;
            while (*pl2 == *ps2)
            {
                ++pl2;
                ++ps2;
            }
            /* delete a char and then strings must be equal */
            if (STRCMP(pl2 + 1, ps2) == 0)
                return score + SCORE_SUBST + SCORE_DEL;

            /* Failed to compare. */
            break;

        case 0:
            /*
             * Lengths are equal, thus changes must result in same length: An
             * insert is only possible in combination with a delete.
             * 1: check for identical strings
             */
            if (*pl == NUL)
                return score;

            /* 2: swap */
            if (pl[0] == ps[1] && pl[1] == ps[0])
            {
                pl2 = pl + 2;       /* swap, skip two chars */
                ps2 = ps + 2;
                while (*pl2 == *ps2)
                {
                    if (*pl2 == NUL)    /* reached the end */
                        return score + SCORE_SWAP;
                    ++pl2;
                    ++ps2;
                }
                /* 3: swap and swap again */
                if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
                                             && STRCMP(pl2 + 2, ps2 + 2) == 0)
                    return score + SCORE_SWAP + SCORE_SWAP;

                /* 4: swap and substitute */
                if (STRCMP(pl2 + 1, ps2 + 1) == 0)
                    return score + SCORE_SWAP + SCORE_SUBST;
            }

            /* 5: substitute */
            pl2 = pl + 1;
            ps2 = ps + 1;
            while (*pl2 == *ps2)
            {
                if (*pl2 == NUL)        /* reached the end */
                    return score + SCORE_SUBST;
                ++pl2;
                ++ps2;
            }

            /* 6: substitute and swap */
            if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
                                             && STRCMP(pl2 + 2, ps2 + 2) == 0)
                return score + SCORE_SUBST + SCORE_SWAP;

            /* 7: substitute and substitute */
            if (STRCMP(pl2 + 1, ps2 + 1) == 0)
                return score + SCORE_SUBST + SCORE_SUBST;

            /* 8: insert then delete */
            pl2 = pl;
            ps2 = ps + 1;
            while (*pl2 == *ps2)
            {
                ++pl2;
                ++ps2;
            }
            if (STRCMP(pl2 + 1, ps2) == 0)
                return score + SCORE_INS + SCORE_DEL;

            /* 9: delete then insert */
            pl2 = pl + 1;
            ps2 = ps;
            while (*pl2 == *ps2)
            {
                ++pl2;
                ++ps2;
            }
            if (STRCMP(pl2, ps2 + 1) == 0)
                return score + SCORE_INS + SCORE_DEL;

            /* Failed to compare. */
            break;
    }

    return SCORE_MAXMAX;
}
7784 * The implementation of the algorithm comes from Aspell editdist.cpp, 7785 * edit_distance(). It has been converted from C++ to C and modified to 7786 * support multi-byte characters. 7787 */ 7788 static int 7789 spell_edit_score( 7790 slang_T *slang, 7791 char_u *badword, 7792 char_u *goodword) 7793 { 7794 int *cnt; 7795 int badlen, goodlen; /* lengths including NUL */ 7796 int j, i; 7797 int t; 7798 int bc, gc; 7799 int pbc, pgc; 7800 char_u *p; 7801 int wbadword[MAXWLEN]; 7802 int wgoodword[MAXWLEN]; 7803 7804 if (has_mbyte) 7805 { 7806 /* Get the characters from the multi-byte strings and put them in an 7807 * int array for easy access. */ 7808 for (p = badword, badlen = 0; *p != NUL; ) 7809 wbadword[badlen++] = mb_cptr2char_adv(&p); 7810 wbadword[badlen++] = 0; 7811 for (p = goodword, goodlen = 0; *p != NUL; ) 7812 wgoodword[goodlen++] = mb_cptr2char_adv(&p); 7813 wgoodword[goodlen++] = 0; 7814 } 7815 else 7816 { 7817 badlen = (int)STRLEN(badword) + 1; 7818 goodlen = (int)STRLEN(goodword) + 1; 7819 } 7820 7821 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */ 7822 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] 7823 cnt = ALLOC_MULT(int, (badlen + 1) * (goodlen + 1)); 7824 if (cnt == NULL) 7825 return 0; /* out of memory */ 7826 7827 CNT(0, 0) = 0; 7828 for (j = 1; j <= goodlen; ++j) 7829 CNT(0, j) = CNT(0, j - 1) + SCORE_INS; 7830 7831 for (i = 1; i <= badlen; ++i) 7832 { 7833 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; 7834 for (j = 1; j <= goodlen; ++j) 7835 { 7836 if (has_mbyte) 7837 { 7838 bc = wbadword[i - 1]; 7839 gc = wgoodword[j - 1]; 7840 } 7841 else 7842 { 7843 bc = badword[i - 1]; 7844 gc = goodword[j - 1]; 7845 } 7846 if (bc == gc) 7847 CNT(i, j) = CNT(i - 1, j - 1); 7848 else 7849 { 7850 /* Use a better score when there is only a case difference. */ 7851 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 7852 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); 7853 else 7854 { 7855 /* For a similar character use SCORE_SIMILAR. 
*/ 7856 if (slang != NULL 7857 && slang->sl_has_map 7858 && similar_chars(slang, gc, bc)) 7859 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); 7860 else 7861 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); 7862 } 7863 7864 if (i > 1 && j > 1) 7865 { 7866 if (has_mbyte) 7867 { 7868 pbc = wbadword[i - 2]; 7869 pgc = wgoodword[j - 2]; 7870 } 7871 else 7872 { 7873 pbc = badword[i - 2]; 7874 pgc = goodword[j - 2]; 7875 } 7876 if (bc == pgc && pbc == gc) 7877 { 7878 t = SCORE_SWAP + CNT(i - 2, j - 2); 7879 if (t < CNT(i, j)) 7880 CNT(i, j) = t; 7881 } 7882 } 7883 t = SCORE_DEL + CNT(i - 1, j); 7884 if (t < CNT(i, j)) 7885 CNT(i, j) = t; 7886 t = SCORE_INS + CNT(i, j - 1); 7887 if (t < CNT(i, j)) 7888 CNT(i, j) = t; 7889 } 7890 } 7891 } 7892 7893 i = CNT(badlen - 1, goodlen - 1); 7894 vim_free(cnt); 7895 return i; 7896 } 7897 7898 typedef struct 7899 { 7900 int badi; 7901 int goodi; 7902 int score; 7903 } limitscore_T; 7904 7905 /* 7906 * Like spell_edit_score(), but with a limit on the score to make it faster. 7907 * May return SCORE_MAXMAX when the score is higher than "limit". 7908 * 7909 * This uses a stack for the edits still to be tried. 7910 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support 7911 * for multi-byte characters. 7912 */ 7913 static int 7914 spell_edit_score_limit( 7915 slang_T *slang, 7916 char_u *badword, 7917 char_u *goodword, 7918 int limit) 7919 { 7920 limitscore_T stack[10]; /* allow for over 3 * 2 edits */ 7921 int stackidx; 7922 int bi, gi; 7923 int bi2, gi2; 7924 int bc, gc; 7925 int score; 7926 int score_off; 7927 int minscore; 7928 int round; 7929 7930 /* Multi-byte characters require a bit more work, use a different function 7931 * to avoid testing "has_mbyte" quite often. */ 7932 if (has_mbyte) 7933 return spell_edit_score_limit_w(slang, badword, goodword, limit); 7934 7935 /* 7936 * The idea is to go from start to end over the words. 
So long as 7937 * characters are equal just continue, this always gives the lowest score. 7938 * When there is a difference try several alternatives. Each alternative 7939 * increases "score" for the edit distance. Some of the alternatives are 7940 * pushed unto a stack and tried later, some are tried right away. At the 7941 * end of the word the score for one alternative is known. The lowest 7942 * possible score is stored in "minscore". 7943 */ 7944 stackidx = 0; 7945 bi = 0; 7946 gi = 0; 7947 score = 0; 7948 minscore = limit + 1; 7949 7950 for (;;) 7951 { 7952 /* Skip over an equal part, score remains the same. */ 7953 for (;;) 7954 { 7955 bc = badword[bi]; 7956 gc = goodword[gi]; 7957 if (bc != gc) /* stop at a char that's different */ 7958 break; 7959 if (bc == NUL) /* both words end */ 7960 { 7961 if (score < minscore) 7962 minscore = score; 7963 goto pop; /* do next alternative */ 7964 } 7965 ++bi; 7966 ++gi; 7967 } 7968 7969 if (gc == NUL) /* goodword ends, delete badword chars */ 7970 { 7971 do 7972 { 7973 if ((score += SCORE_DEL) >= minscore) 7974 goto pop; /* do next alternative */ 7975 } while (badword[++bi] != NUL); 7976 minscore = score; 7977 } 7978 else if (bc == NUL) /* badword ends, insert badword chars */ 7979 { 7980 do 7981 { 7982 if ((score += SCORE_INS) >= minscore) 7983 goto pop; /* do next alternative */ 7984 } while (goodword[++gi] != NUL); 7985 minscore = score; 7986 } 7987 else /* both words continue */ 7988 { 7989 /* If not close to the limit, perform a change. Only try changes 7990 * that may lead to a lower score than "minscore". 7991 * round 0: try deleting a char from badword 7992 * round 1: try inserting a char in badword */ 7993 for (round = 0; round <= 1; ++round) 7994 { 7995 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); 7996 if (score_off < minscore) 7997 { 7998 if (score_off + SCORE_EDIT_MIN >= minscore) 7999 { 8000 /* Near the limit, rest of the words must match. 
We 8001 * can check that right now, no need to push an item 8002 * onto the stack. */ 8003 bi2 = bi + 1 - round; 8004 gi2 = gi + round; 8005 while (goodword[gi2] == badword[bi2]) 8006 { 8007 if (goodword[gi2] == NUL) 8008 { 8009 minscore = score_off; 8010 break; 8011 } 8012 ++bi2; 8013 ++gi2; 8014 } 8015 } 8016 else 8017 { 8018 /* try deleting/inserting a character later */ 8019 stack[stackidx].badi = bi + 1 - round; 8020 stack[stackidx].goodi = gi + round; 8021 stack[stackidx].score = score_off; 8022 ++stackidx; 8023 } 8024 } 8025 } 8026 8027 if (score + SCORE_SWAP < minscore) 8028 { 8029 /* If swapping two characters makes a match then the 8030 * substitution is more expensive, thus there is no need to 8031 * try both. */ 8032 if (gc == badword[bi + 1] && bc == goodword[gi + 1]) 8033 { 8034 /* Swap two characters, that is: skip them. */ 8035 gi += 2; 8036 bi += 2; 8037 score += SCORE_SWAP; 8038 continue; 8039 } 8040 } 8041 8042 /* Substitute one character for another which is the same 8043 * thing as deleting a character from both goodword and badword. 8044 * Use a better score when there is only a case difference. */ 8045 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 8046 score += SCORE_ICASE; 8047 else 8048 { 8049 /* For a similar character use SCORE_SIMILAR. */ 8050 if (slang != NULL 8051 && slang->sl_has_map 8052 && similar_chars(slang, gc, bc)) 8053 score += SCORE_SIMILAR; 8054 else 8055 score += SCORE_SUBST; 8056 } 8057 8058 if (score < minscore) 8059 { 8060 /* Do the substitution. */ 8061 ++gi; 8062 ++bi; 8063 continue; 8064 } 8065 } 8066 pop: 8067 /* 8068 * Get here to try the next alternative, pop it from the stack. 8069 */ 8070 if (stackidx == 0) /* stack is empty, finished */ 8071 break; 8072 8073 /* pop an item from the stack */ 8074 --stackidx; 8075 gi = stack[stackidx].goodi; 8076 bi = stack[stackidx].badi; 8077 score = stack[stackidx].score; 8078 } 8079 8080 /* When the score goes over "limit" it may actually be much higher. 
/*
 * Multi-byte version of spell_edit_score_limit().
 * Keep it in sync with the above!
 *
 * Computes the edit score to turn "badword" into "goodword", giving up on
 * alternatives that cannot get below "limit".
 * "slang" may be NULL, then similar characters are not recognized.
 * Returns the lowest score found, SCORE_MAXMAX when it is above "limit".
 */
static int
spell_edit_score_limit_w(
    slang_T     *slang,
    char_u      *badword,
    char_u      *goodword,
    int         limit)
{
    /* NOTE(review): nothing below checks "stackidx" against the size of
     * "stack"; presumably "limit" keeps the depth small - confirm. */
    limitscore_T    stack[10];          /* allow for over 3 * 2 edits */
    int             stackidx;
    int             bi, gi;
    int             bi2, gi2;
    int             bc, gc;
    int             score;
    int             score_off;
    int             minscore;
    int             round;
    char_u          *p;
    int             wbadword[MAXWLEN];
    int             wgoodword[MAXWLEN];

    /* Get the characters from the multi-byte strings and put them in an
     * int array for easy access.
     * NOTE(review): assumes both words have less than MAXWLEN characters,
     * there is no check here - confirm against the callers. */
    bi = 0;
    for (p = badword; *p != NUL; )
        wbadword[bi++] = mb_cptr2char_adv(&p);
    wbadword[bi++] = 0;
    gi = 0;
    for (p = goodword; *p != NUL; )
        wgoodword[gi++] = mb_cptr2char_adv(&p);
    wgoodword[gi++] = 0;

    /*
     * The idea is to go from start to end over the words.  So long as
     * characters are equal just continue, this always gives the lowest score.
     * When there is a difference try several alternatives.  Each alternative
     * increases "score" for the edit distance.  Some of the alternatives are
     * pushed onto a stack and tried later, some are tried right away.  At the
     * end of the word the score for one alternative is known.  The lowest
     * possible score is stored in "minscore".
     */
    stackidx = 0;
    bi = 0;
    gi = 0;
    score = 0;
    minscore = limit + 1;

    for (;;)
    {
        /* Skip over an equal part, score remains the same. */
        for (;;)
        {
            bc = wbadword[bi];
            gc = wgoodword[gi];

            if (bc != gc)       /* stop at a char that's different */
                break;
            if (bc == NUL)      /* both words end */
            {
                if (score < minscore)
                    minscore = score;
                goto pop;       /* do next alternative */
            }
            ++bi;
            ++gi;
        }

        if (gc == NUL)    /* goodword ends, delete badword chars */
        {
            do
            {
                if ((score += SCORE_DEL) >= minscore)
                    goto pop;       /* do next alternative */
            } while (wbadword[++bi] != NUL);
            minscore = score;
        }
        else if (bc == NUL) /* badword ends, insert the rest of goodword */
        {
            do
            {
                if ((score += SCORE_INS) >= minscore)
                    goto pop;       /* do next alternative */
            } while (wgoodword[++gi] != NUL);
            minscore = score;
        }
        else    /* both words continue */
        {
            /* If not close to the limit, perform a change.  Only try changes
             * that may lead to a lower score than "minscore".
             * round 0: try deleting a char from badword
             * round 1: try inserting a char in badword */
            for (round = 0; round <= 1; ++round)
            {
                score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
                if (score_off < minscore)
                {
                    if (score_off + SCORE_EDIT_MIN >= minscore)
                    {
                        /* Near the limit, rest of the words must match.  We
                         * can check that right now, no need to push an item
                         * onto the stack. */
                        bi2 = bi + 1 - round;
                        gi2 = gi + round;
                        while (wgoodword[gi2] == wbadword[bi2])
                        {
                            if (wgoodword[gi2] == NUL)
                            {
                                minscore = score_off;
                                break;
                            }
                            ++bi2;
                            ++gi2;
                        }
                    }
                    else
                    {
                        /* try deleting/inserting a character later */
                        stack[stackidx].badi = bi + 1 - round;
                        stack[stackidx].goodi = gi + round;
                        stack[stackidx].score = score_off;
                        ++stackidx;
                    }
                }
            }

            if (score + SCORE_SWAP < minscore)
            {
                /* If swapping two characters makes a match then the
                 * substitution is more expensive, thus there is no need to
                 * try both. */
                if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
                {
                    /* Swap two characters, that is: skip them. */
                    gi += 2;
                    bi += 2;
                    score += SCORE_SWAP;
                    continue;
                }
            }

            /* Substitute one character for another which is the same
             * thing as deleting a character from both goodword and badword.
             * Use a better score when there is only a case difference. */
            if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
                score += SCORE_ICASE;
            else
            {
                /* For a similar character use SCORE_SIMILAR. */
                if (slang != NULL
                        && slang->sl_has_map
                        && similar_chars(slang, gc, bc))
                    score += SCORE_SIMILAR;
                else
                    score += SCORE_SUBST;
            }

            if (score < minscore)
            {
                /* Do the substitution. */
                ++gi;
                ++bi;
                continue;
            }
        }
pop:
        /*
         * Get here to try the next alternative, pop it from the stack.
         */
        if (stackidx == 0)              /* stack is empty, finished */
            break;

        /* pop an item from the stack */
        --stackidx;
        gi = stack[stackidx].goodi;
        bi = stack[stackidx].badi;
        score = stack[stackidx].score;
    }

    /* When the score goes over "limit" it may actually be much higher.
     * Return a very large number to avoid going below the limit when giving a
     * bonus. */
    if (minscore > limit)
        return SCORE_MAXMAX;
    return minscore;
}

/*
 * ":spellinfo"
 * For every language used in the current window show the name of the spell
 * file and, when present, its info text.  Does nothing when spell checking
 * is not possible in the current window.  Stops when interrupted.
 */
void
ex_spellinfo(exarg_T *eap UNUSED)
{
    int         lpi;
    langp_T     *lp;
    char_u      *p;

    if (no_spell_checking(curwin))
        return;

    msg_start();
    for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len && !got_int; ++lpi)
    {
        lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
        msg_puts("file: ");
        msg_puts((char *)lp->lp_slang->sl_fname);
        msg_putchar('\n');
        p = lp->lp_slang->sl_info;
        if (p != NULL)
        {
            msg_puts((char *)p);
            msg_putchar('\n');
        }
    }
    msg_end();
}

/* Flags for the "dumpflags_arg" argument of spell_dump_compl(). */
#define DUMPFLAG_KEEPCASE   1   /* round 2: keep-case tree */
#define DUMPFLAG_COUNT      2   /* include word count */
#define DUMPFLAG_ICASE      4   /* ignore case when finding matches */
#define DUMPFLAG_ONECAP     8   /* pattern starts with capital */
#define DUMPFLAG_ALLCAP     16  /* pattern is all capitals */

/*
 * ":spelldump"
 * Open a new window, give it the 'spelllang' value of the window the command
 * was used in and dump all words there.  With "eap->forceit" set the word
 * count is included.
 */
void
ex_spelldump(exarg_T *eap)
{
    char_u      *spl;
    long        dummy;

    if (no_spell_checking(curwin))
        return;
    /* Get 'spelllang' before another window becomes the current one. */
    get_option_value((char_u*)"spl", &dummy, &spl, OPT_LOCAL);

    /* Create a new empty buffer in a new window. */
    do_cmdline_cmd((char_u *)"new");

    /* enable spelling locally in the new window */
    set_option_value((char_u*)"spell", TRUE, (char_u*)"", OPT_LOCAL);
    set_option_value((char_u*)"spl", dummy, spl, OPT_LOCAL);
    vim_free(spl);

    /* Only dump into an empty buffer. */
    if (!BUFEMPTY())
        return;

    spell_dump_compl(NULL, 0, NULL, eap->forceit ? DUMPFLAG_COUNT : 0);

    /* Delete the empty line that we started with. */
    if (curbuf->b_ml.ml_line_count > 1)
        ml_delete(curbuf->b_ml.ml_line_count, FALSE);

    redraw_later(NOT_VALID);
}
8349 * "ic" and "dir" are not used. 8350 * 2. When "pat" is not NULL: add matching words to insert mode completion. 8351 */ 8352 void 8353 spell_dump_compl( 8354 char_u *pat, /* leading part of the word */ 8355 int ic, /* ignore case */ 8356 int *dir, /* direction for adding matches */ 8357 int dumpflags_arg) /* DUMPFLAG_* */ 8358 { 8359 langp_T *lp; 8360 slang_T *slang; 8361 idx_T arridx[MAXWLEN]; 8362 int curi[MAXWLEN]; 8363 char_u word[MAXWLEN]; 8364 int c; 8365 char_u *byts; 8366 idx_T *idxs; 8367 linenr_T lnum = 0; 8368 int round; 8369 int depth; 8370 int n; 8371 int flags; 8372 char_u *region_names = NULL; /* region names being used */ 8373 int do_region = TRUE; /* dump region names and numbers */ 8374 char_u *p; 8375 int lpi; 8376 int dumpflags = dumpflags_arg; 8377 int patlen; 8378 8379 /* When ignoring case or when the pattern starts with capital pass this on 8380 * to dump_word(). */ 8381 if (pat != NULL) 8382 { 8383 if (ic) 8384 dumpflags |= DUMPFLAG_ICASE; 8385 else 8386 { 8387 n = captype(pat, NULL); 8388 if (n == WF_ONECAP) 8389 dumpflags |= DUMPFLAG_ONECAP; 8390 else if (n == WF_ALLCAP && (int)STRLEN(pat) > mb_ptr2len(pat)) 8391 dumpflags |= DUMPFLAG_ALLCAP; 8392 } 8393 } 8394 8395 /* Find out if we can support regions: All languages must support the same 8396 * regions or none at all. 
*/ 8397 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 8398 { 8399 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8400 p = lp->lp_slang->sl_regions; 8401 if (p[0] != 0) 8402 { 8403 if (region_names == NULL) /* first language with regions */ 8404 region_names = p; 8405 else if (STRCMP(region_names, p) != 0) 8406 { 8407 do_region = FALSE; /* region names are different */ 8408 break; 8409 } 8410 } 8411 } 8412 8413 if (do_region && region_names != NULL) 8414 { 8415 if (pat == NULL) 8416 { 8417 vim_snprintf((char *)IObuff, IOSIZE, "/regions=%s", region_names); 8418 ml_append(lnum++, IObuff, (colnr_T)0, FALSE); 8419 } 8420 } 8421 else 8422 do_region = FALSE; 8423 8424 /* 8425 * Loop over all files loaded for the entries in 'spelllang'. 8426 */ 8427 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 8428 { 8429 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8430 slang = lp->lp_slang; 8431 if (slang->sl_fbyts == NULL) /* reloading failed */ 8432 continue; 8433 8434 if (pat == NULL) 8435 { 8436 vim_snprintf((char *)IObuff, IOSIZE, "# file: %s", slang->sl_fname); 8437 ml_append(lnum++, IObuff, (colnr_T)0, FALSE); 8438 } 8439 8440 /* When matching with a pattern and there are no prefixes only use 8441 * parts of the tree that match "pat". 
*/ 8442 if (pat != NULL && slang->sl_pbyts == NULL) 8443 patlen = (int)STRLEN(pat); 8444 else 8445 patlen = -1; 8446 8447 /* round 1: case-folded tree 8448 * round 2: keep-case tree */ 8449 for (round = 1; round <= 2; ++round) 8450 { 8451 if (round == 1) 8452 { 8453 dumpflags &= ~DUMPFLAG_KEEPCASE; 8454 byts = slang->sl_fbyts; 8455 idxs = slang->sl_fidxs; 8456 } 8457 else 8458 { 8459 dumpflags |= DUMPFLAG_KEEPCASE; 8460 byts = slang->sl_kbyts; 8461 idxs = slang->sl_kidxs; 8462 } 8463 if (byts == NULL) 8464 continue; /* array is empty */ 8465 8466 depth = 0; 8467 arridx[0] = 0; 8468 curi[0] = 1; 8469 while (depth >= 0 && !got_int 8470 && (pat == NULL || !ins_compl_interrupted())) 8471 { 8472 if (curi[depth] > byts[arridx[depth]]) 8473 { 8474 /* Done all bytes at this node, go up one level. */ 8475 --depth; 8476 line_breakcheck(); 8477 ins_compl_check_keys(50, FALSE); 8478 } 8479 else 8480 { 8481 /* Do one more byte at this node. */ 8482 n = arridx[depth] + curi[depth]; 8483 ++curi[depth]; 8484 c = byts[n]; 8485 if (c == 0) 8486 { 8487 /* End of word, deal with the word. 8488 * Don't use keep-case words in the fold-case tree, 8489 * they will appear in the keep-case tree. 8490 * Only use the word when the region matches. */ 8491 flags = (int)idxs[n]; 8492 if ((round == 2 || (flags & WF_KEEPCAP) == 0) 8493 && (flags & WF_NEEDCOMP) == 0 8494 && (do_region 8495 || (flags & WF_REGION) == 0 8496 || (((unsigned)flags >> 16) 8497 & lp->lp_region) != 0)) 8498 { 8499 word[depth] = NUL; 8500 if (!do_region) 8501 flags &= ~WF_REGION; 8502 8503 /* Dump the basic word if there is no prefix or 8504 * when it's the first one. */ 8505 c = (unsigned)flags >> 24; 8506 if (c == 0 || curi[depth] == 2) 8507 { 8508 dump_word(slang, word, pat, dir, 8509 dumpflags, flags, lnum); 8510 if (pat == NULL) 8511 ++lnum; 8512 } 8513 8514 /* Apply the prefix, if there is one. 
*/ 8515 if (c != 0) 8516 lnum = dump_prefixes(slang, word, pat, dir, 8517 dumpflags, flags, lnum); 8518 } 8519 } 8520 else 8521 { 8522 /* Normal char, go one level deeper. */ 8523 word[depth++] = c; 8524 arridx[depth] = idxs[n]; 8525 curi[depth] = 1; 8526 8527 /* Check if this characters matches with the pattern. 8528 * If not skip the whole tree below it. 8529 * Always ignore case here, dump_word() will check 8530 * proper case later. This isn't exactly right when 8531 * length changes for multi-byte characters with 8532 * ignore case... */ 8533 if (depth <= patlen 8534 && MB_STRNICMP(word, pat, depth) != 0) 8535 --depth; 8536 } 8537 } 8538 } 8539 } 8540 } 8541 } 8542 8543 /* 8544 * Dump one word: apply case modifications and append a line to the buffer. 8545 * When "lnum" is zero add insert mode completion. 8546 */ 8547 static void 8548 dump_word( 8549 slang_T *slang, 8550 char_u *word, 8551 char_u *pat, 8552 int *dir, 8553 int dumpflags, 8554 int wordflags, 8555 linenr_T lnum) 8556 { 8557 int keepcap = FALSE; 8558 char_u *p; 8559 char_u *tw; 8560 char_u cword[MAXWLEN]; 8561 char_u badword[MAXWLEN + 10]; 8562 int i; 8563 int flags = wordflags; 8564 8565 if (dumpflags & DUMPFLAG_ONECAP) 8566 flags |= WF_ONECAP; 8567 if (dumpflags & DUMPFLAG_ALLCAP) 8568 flags |= WF_ALLCAP; 8569 8570 if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) 8571 { 8572 /* Need to fix case according to "flags". */ 8573 make_case_word(word, cword, flags); 8574 p = cword; 8575 } 8576 else 8577 { 8578 p = word; 8579 if ((dumpflags & DUMPFLAG_KEEPCASE) 8580 && ((captype(word, NULL) & WF_KEEPCAP) == 0 8581 || (flags & WF_FIXCAP) != 0)) 8582 keepcap = TRUE; 8583 } 8584 tw = p; 8585 8586 if (pat == NULL) 8587 { 8588 /* Add flags and regions after a slash. 
*/ 8589 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) 8590 { 8591 STRCPY(badword, p); 8592 STRCAT(badword, "/"); 8593 if (keepcap) 8594 STRCAT(badword, "="); 8595 if (flags & WF_BANNED) 8596 STRCAT(badword, "!"); 8597 else if (flags & WF_RARE) 8598 STRCAT(badword, "?"); 8599 if (flags & WF_REGION) 8600 for (i = 0; i < 7; ++i) 8601 if (flags & (0x10000 << i)) 8602 sprintf((char *)badword + STRLEN(badword), "%d", i + 1); 8603 p = badword; 8604 } 8605 8606 if (dumpflags & DUMPFLAG_COUNT) 8607 { 8608 hashitem_T *hi; 8609 8610 /* Include the word count for ":spelldump!". */ 8611 hi = hash_find(&slang->sl_wordcount, tw); 8612 if (!HASHITEM_EMPTY(hi)) 8613 { 8614 vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d", 8615 tw, HI2WC(hi)->wc_count); 8616 p = IObuff; 8617 } 8618 } 8619 8620 ml_append(lnum, p, (colnr_T)0, FALSE); 8621 } 8622 else if (((dumpflags & DUMPFLAG_ICASE) 8623 ? MB_STRNICMP(p, pat, STRLEN(pat)) == 0 8624 : STRNCMP(p, pat, STRLEN(pat)) == 0) 8625 && ins_compl_add_infercase(p, (int)STRLEN(p), 8626 p_ic, NULL, *dir, FALSE) == OK) 8627 /* if dir was BACKWARD then honor it just once */ 8628 *dir = FORWARD; 8629 } 8630 8631 /* 8632 * For ":spelldump": Find matching prefixes for "word". Prepend each to 8633 * "word" and append a line to the buffer. 8634 * When "lnum" is zero add insert mode completion. 8635 * Return the updated line number. 
8636 */ 8637 static linenr_T 8638 dump_prefixes( 8639 slang_T *slang, 8640 char_u *word, /* case-folded word */ 8641 char_u *pat, 8642 int *dir, 8643 int dumpflags, 8644 int flags, /* flags with prefix ID */ 8645 linenr_T startlnum) 8646 { 8647 idx_T arridx[MAXWLEN]; 8648 int curi[MAXWLEN]; 8649 char_u prefix[MAXWLEN]; 8650 char_u word_up[MAXWLEN]; 8651 int has_word_up = FALSE; 8652 int c; 8653 char_u *byts; 8654 idx_T *idxs; 8655 linenr_T lnum = startlnum; 8656 int depth; 8657 int n; 8658 int len; 8659 int i; 8660 8661 /* If the word starts with a lower-case letter make the word with an 8662 * upper-case letter in word_up[]. */ 8663 c = PTR2CHAR(word); 8664 if (SPELL_TOUPPER(c) != c) 8665 { 8666 onecap_copy(word, word_up, TRUE); 8667 has_word_up = TRUE; 8668 } 8669 8670 byts = slang->sl_pbyts; 8671 idxs = slang->sl_pidxs; 8672 if (byts != NULL) /* array not is empty */ 8673 { 8674 /* 8675 * Loop over all prefixes, building them byte-by-byte in prefix[]. 8676 * When at the end of a prefix check that it supports "flags". 8677 */ 8678 depth = 0; 8679 arridx[0] = 0; 8680 curi[0] = 1; 8681 while (depth >= 0 && !got_int) 8682 { 8683 n = arridx[depth]; 8684 len = byts[n]; 8685 if (curi[depth] > len) 8686 { 8687 /* Done all bytes at this node, go up one level. */ 8688 --depth; 8689 line_breakcheck(); 8690 } 8691 else 8692 { 8693 /* Do one more byte at this node. */ 8694 n += curi[depth]; 8695 ++curi[depth]; 8696 c = byts[n]; 8697 if (c == 0) 8698 { 8699 /* End of prefix, find out how many IDs there are. */ 8700 for (i = 1; i < len; ++i) 8701 if (byts[n + i] != 0) 8702 break; 8703 curi[depth] += i - 1; 8704 8705 c = valid_word_prefix(i, n, flags, word, slang, FALSE); 8706 if (c != 0) 8707 { 8708 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1); 8709 dump_word(slang, prefix, pat, dir, dumpflags, 8710 (c & WF_RAREPFX) ? 
(flags | WF_RARE) 8711 : flags, lnum); 8712 if (lnum != 0) 8713 ++lnum; 8714 } 8715 8716 /* Check for prefix that matches the word when the 8717 * first letter is upper-case, but only if the prefix has 8718 * a condition. */ 8719 if (has_word_up) 8720 { 8721 c = valid_word_prefix(i, n, flags, word_up, slang, 8722 TRUE); 8723 if (c != 0) 8724 { 8725 vim_strncpy(prefix + depth, word_up, 8726 MAXWLEN - depth - 1); 8727 dump_word(slang, prefix, pat, dir, dumpflags, 8728 (c & WF_RAREPFX) ? (flags | WF_RARE) 8729 : flags, lnum); 8730 if (lnum != 0) 8731 ++lnum; 8732 } 8733 } 8734 } 8735 else 8736 { 8737 /* Normal char, go one level deeper. */ 8738 prefix[depth++] = c; 8739 arridx[depth] = idxs[n]; 8740 curi[depth] = 1; 8741 } 8742 } 8743 } 8744 } 8745 8746 return lnum; 8747 } 8748 8749 /* 8750 * Move "p" to the end of word "start". 8751 * Uses the spell-checking word characters. 8752 */ 8753 char_u * 8754 spell_to_word_end(char_u *start, win_T *win) 8755 { 8756 char_u *p = start; 8757 8758 while (*p != NUL && spell_iswordp(p, win)) 8759 MB_PTR_ADV(p); 8760 return p; 8761 } 8762 8763 #if defined(FEAT_INS_EXPAND) || defined(PROTO) 8764 /* 8765 * For Insert mode completion CTRL-X s: 8766 * Find start of the word in front of column "startcol". 8767 * We don't check if it is badly spelled, with completion we can only change 8768 * the word in front of the cursor. 8769 * Returns the column number of the word. 8770 */ 8771 int 8772 spell_word_start(int startcol) 8773 { 8774 char_u *line; 8775 char_u *p; 8776 int col = 0; 8777 8778 if (no_spell_checking(curwin)) 8779 return startcol; 8780 8781 /* Find a word character before "startcol". */ 8782 line = ml_get_curline(); 8783 for (p = line + startcol; p > line; ) 8784 { 8785 MB_PTR_BACK(line, p); 8786 if (spell_iswordp_nmw(p, curwin)) 8787 break; 8788 } 8789 8790 /* Go back to start of the word. 
*/ 8791 while (p > line) 8792 { 8793 col = (int)(p - line); 8794 MB_PTR_BACK(line, p); 8795 if (!spell_iswordp(p, curwin)) 8796 break; 8797 col = 0; 8798 } 8799 8800 return col; 8801 } 8802 8803 /* 8804 * Need to check for 'spellcapcheck' now, the word is removed before 8805 * expand_spelling() is called. Therefore the ugly global variable. 8806 */ 8807 static int spell_expand_need_cap; 8808 8809 void 8810 spell_expand_check_cap(colnr_T col) 8811 { 8812 spell_expand_need_cap = check_need_cap(curwin->w_cursor.lnum, col); 8813 } 8814 8815 /* 8816 * Get list of spelling suggestions. 8817 * Used for Insert mode completion CTRL-X ?. 8818 * Returns the number of matches. The matches are in "matchp[]", array of 8819 * allocated strings. 8820 */ 8821 int 8822 expand_spelling( 8823 linenr_T lnum UNUSED, 8824 char_u *pat, 8825 char_u ***matchp) 8826 { 8827 garray_T ga; 8828 8829 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, TRUE); 8830 *matchp = ga.ga_data; 8831 return ga.ga_len; 8832 } 8833 #endif 8834 8835 #endif /* FEAT_SPELL */ 8836