/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * VIM - Vi IMproved	by Bram Moolenaar
 *
 * Do ":help uganda"  in Vim to read copying and usage conditions.
 * Do ":help credits" in Vim to see a list of people who contributed.
 * See README.txt for an overview of the Vim source code.
 */

/*
 * spell.c: code for spell checking
 *
 * See spellfile.c for the Vim spell file format.
 *
 * The spell checking mechanism uses a tree (aka trie).  Each node in the tree
 * has a list of bytes that can appear (siblings).  For each byte there is a
 * pointer to the node with the byte that follows in the word (child).
 *
 * A NUL byte is used where the word may end.  The bytes are sorted, so that
 * binary searching can be used and the NUL bytes are at the start.  The
 * number of possible bytes is stored before the list of bytes.
 *
 * The tree uses two arrays: "byts" stores the characters, "idxs" stores
 * either the next index or flags.  The tree starts at index 0.  For example,
 * to lookup "vi" this sequence is followed:
 *	i = 0
 *	len = byts[i]
 *	n = where "v" appears in byts[i + 1] to byts[i + len]
 *	i = idxs[n]
 *	len = byts[i]
 *	n = where "i" appears in byts[i + 1] to byts[i + len]
 *	i = idxs[n]
 *	len = byts[i]
 *	find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
 *
 * There are two word trees: one with case-folded words and one with words in
 * original case.  The second one is only used for keep-case words and is
 * usually small.
 *
 * There is one additional tree for when not all prefixes are applied when
 * generating the .spl file.  This tree stores all the possible prefixes, as
 * if they were words.  At each word (prefix) end the prefix nr is stored, the
 * following word must support this prefix nr.  And the condition nr is
 * stored, used to lookup the condition that the word must match with.
45 * 46 * Thanks to Olaf Seibert for providing an example implementation of this tree 47 * and the compression mechanism. 48 * LZ trie ideas: 49 * http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf 50 * More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html 51 * 52 * Matching involves checking the caps type: Onecap ALLCAP KeepCap. 53 * 54 * Why doesn't Vim use aspell/ispell/myspell/etc.? 55 * See ":help develop-spell". 56 */ 57 58 /* 59 * Use this to adjust the score after finding suggestions, based on the 60 * suggested word sounding like the bad word. This is much faster than doing 61 * it for every possible suggestion. 62 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" 63 * vs "ht") and goes down in the list. 64 * Used when 'spellsuggest' is set to "best". 65 */ 66 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) 67 68 /* 69 * Do the opposite: based on a maximum end score and a known sound score, 70 * compute the maximum word score that can be used. 71 */ 72 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3) 73 74 #define IN_SPELL_C 75 #include "vim.h" 76 77 #if defined(FEAT_SPELL) || defined(PROTO) 78 79 #ifndef UNIX /* it's in os_unix.h for Unix */ 80 # include <time.h> /* for time_t */ 81 #endif 82 83 /* only used for su_badflags */ 84 #define WF_MIXCAP 0x20 /* mix of upper and lower case: macaRONI */ 85 86 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP | WF_FIXCAP) 87 88 #define REGION_ALL 0xff /* word valid in all regions */ 89 90 #define VIMSUGMAGIC "VIMsug" /* string at start of Vim .sug file */ 91 #define VIMSUGMAGICL 6 92 #define VIMSUGVERSION 1 93 94 /* Result values. Lower number is accepted over higher one. 
*/ 95 #define SP_BANNED -1 96 #define SP_OK 0 97 #define SP_RARE 1 98 #define SP_LOCAL 2 99 #define SP_BAD 3 100 101 typedef struct wordcount_S 102 { 103 short_u wc_count; /* nr of times word was seen */ 104 char_u wc_word[1]; /* word, actually longer */ 105 } wordcount_T; 106 107 #define WC_KEY_OFF offsetof(wordcount_T, wc_word) 108 #define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF)) 109 #define MAXWORDCOUNT 0xffff 110 111 /* 112 * Information used when looking for suggestions. 113 */ 114 typedef struct suginfo_S 115 { 116 garray_T su_ga; /* suggestions, contains "suggest_T" */ 117 int su_maxcount; /* max. number of suggestions displayed */ 118 int su_maxscore; /* maximum score for adding to su_ga */ 119 int su_sfmaxscore; /* idem, for when doing soundfold words */ 120 garray_T su_sga; /* like su_ga, sound-folded scoring */ 121 char_u *su_badptr; /* start of bad word in line */ 122 int su_badlen; /* length of detected bad word in line */ 123 int su_badflags; /* caps flags for bad word */ 124 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ 125 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ 126 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */ 127 hashtab_T su_banned; /* table with banned words */ 128 slang_T *su_sallang; /* default language for sound folding */ 129 } suginfo_T; 130 131 /* One word suggestion. Used in "si_ga". 
*/ 132 typedef struct suggest_S 133 { 134 char_u *st_word; /* suggested word, allocated string */ 135 int st_wordlen; /* STRLEN(st_word) */ 136 int st_orglen; /* length of replaced text */ 137 int st_score; /* lower is better */ 138 int st_altscore; /* used when st_score compares equal */ 139 int st_salscore; /* st_score is for soundalike */ 140 int st_had_bonus; /* bonus already included in score */ 141 slang_T *st_slang; /* language used for sound folding */ 142 } suggest_T; 143 144 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) 145 146 /* TRUE if a word appears in the list of banned words. */ 147 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word))) 148 149 /* Number of suggestions kept when cleaning up. We need to keep more than 150 * what is displayed, because when rescore_suggestions() is called the score 151 * may change and wrong suggestions may be removed later. */ 152 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20) 153 154 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots 155 * of suggestions that are not going to be displayed. 
*/ 156 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) 157 158 /* score for various changes */ 159 #define SCORE_SPLIT 149 /* split bad word */ 160 #define SCORE_SPLIT_NO 249 /* split bad word with NOSPLITSUGS */ 161 #define SCORE_ICASE 52 /* slightly different case */ 162 #define SCORE_REGION 200 /* word is for different region */ 163 #define SCORE_RARE 180 /* rare word */ 164 #define SCORE_SWAP 75 /* swap two characters */ 165 #define SCORE_SWAP3 110 /* swap two characters in three */ 166 #define SCORE_REP 65 /* REP replacement */ 167 #define SCORE_SUBST 93 /* substitute a character */ 168 #define SCORE_SIMILAR 33 /* substitute a similar character */ 169 #define SCORE_SUBCOMP 33 /* substitute a composing character */ 170 #define SCORE_DEL 94 /* delete a character */ 171 #define SCORE_DELDUP 66 /* delete a duplicated character */ 172 #define SCORE_DELCOMP 28 /* delete a composing character */ 173 #define SCORE_INS 96 /* insert a character */ 174 #define SCORE_INSDUP 67 /* insert a duplicate character */ 175 #define SCORE_INSCOMP 30 /* insert a composing character */ 176 #define SCORE_NONWORD 103 /* change non-word to word char */ 177 178 #define SCORE_FILE 30 /* suggestion from a file */ 179 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. 180 * 350 allows for about three changes. */ 181 182 #define SCORE_COMMON1 30 /* subtracted for words seen before */ 183 #define SCORE_COMMON2 40 /* subtracted for words often seen */ 184 #define SCORE_COMMON3 50 /* subtracted for words very often seen */ 185 #define SCORE_THRES2 10 /* word count threshold for COMMON2 */ 186 #define SCORE_THRES3 100 /* word count threshold for COMMON3 */ 187 188 /* When trying changed soundfold words it becomes slow when trying more than 189 * two changes. With less then two changes it's slightly faster but we miss a 190 * few good suggestions. In rare cases we need to try three of four changes. 
191 */ 192 #define SCORE_SFMAX1 200 /* maximum score for first try */ 193 #define SCORE_SFMAX2 300 /* maximum score for second try */ 194 #define SCORE_SFMAX3 400 /* maximum score for third try */ 195 196 #define SCORE_BIG SCORE_INS * 3 /* big difference */ 197 #define SCORE_MAXMAX 999999 /* accept any score */ 198 #define SCORE_LIMITMAX 350 /* for spell_edit_score_limit() */ 199 200 /* for spell_edit_score_limit() we need to know the minimum value of 201 * SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS */ 202 #define SCORE_EDIT_MIN SCORE_SIMILAR 203 204 /* 205 * Structure to store info for word matching. 206 */ 207 typedef struct matchinf_S 208 { 209 langp_T *mi_lp; /* info for language and region */ 210 211 /* pointers to original text to be checked */ 212 char_u *mi_word; /* start of word being checked */ 213 char_u *mi_end; /* end of matching word so far */ 214 char_u *mi_fend; /* next char to be added to mi_fword */ 215 char_u *mi_cend; /* char after what was used for 216 mi_capflags */ 217 218 /* case-folded text */ 219 char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */ 220 int mi_fwordlen; /* nr of valid bytes in mi_fword */ 221 222 /* for when checking word after a prefix */ 223 int mi_prefarridx; /* index in sl_pidxs with list of 224 affixID/condition */ 225 int mi_prefcnt; /* number of entries at mi_prefarridx */ 226 int mi_prefixlen; /* byte length of prefix */ 227 int mi_cprefixlen; /* byte length of prefix in original 228 case */ 229 230 /* for when checking a compound word */ 231 int mi_compoff; /* start of following word offset */ 232 char_u mi_compflags[MAXWLEN]; /* flags for compound words used */ 233 int mi_complen; /* nr of compound words used */ 234 int mi_compextra; /* nr of COMPOUNDROOT words */ 235 236 /* others */ 237 int mi_result; /* result so far: SP_BAD, SP_OK, etc. 
*/ 238 int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */ 239 win_T *mi_win; /* buffer being checked */ 240 241 /* for NOBREAK */ 242 int mi_result2; /* "mi_resul" without following word */ 243 char_u *mi_end2; /* "mi_end" without following word */ 244 } matchinf_T; 245 246 247 static int spell_iswordp(char_u *p, win_T *wp); 248 static int spell_mb_isword_class(int cl, win_T *wp); 249 250 /* 251 * For finding suggestions: At each node in the tree these states are tried: 252 */ 253 typedef enum 254 { 255 STATE_START = 0, /* At start of node check for NUL bytes (goodword 256 * ends); if badword ends there is a match, otherwise 257 * try splitting word. */ 258 STATE_NOPREFIX, /* try without prefix */ 259 STATE_SPLITUNDO, /* Undo splitting. */ 260 STATE_ENDNUL, /* Past NUL bytes at start of the node. */ 261 STATE_PLAIN, /* Use each byte of the node. */ 262 STATE_DEL, /* Delete a byte from the bad word. */ 263 STATE_INS_PREP, /* Prepare for inserting bytes. */ 264 STATE_INS, /* Insert a byte in the bad word. */ 265 STATE_SWAP, /* Swap two bytes. */ 266 STATE_UNSWAP, /* Undo swap two characters. */ 267 STATE_SWAP3, /* Swap two characters over three. */ 268 STATE_UNSWAP3, /* Undo Swap two characters over three. */ 269 STATE_UNROT3L, /* Undo rotate three characters left */ 270 STATE_UNROT3R, /* Undo rotate three characters right */ 271 STATE_REP_INI, /* Prepare for using REP items. */ 272 STATE_REP, /* Use matching REP items from the .aff file. */ 273 STATE_REP_UNDO, /* Undo a REP item replacement. */ 274 STATE_FINAL /* End of this node. */ 275 } state_T; 276 277 /* 278 * Struct to keep the state at each level in suggest_try_change(). 
279 */ 280 typedef struct trystate_S 281 { 282 state_T ts_state; /* state at this level, STATE_ */ 283 int ts_score; /* score */ 284 idx_T ts_arridx; /* index in tree array, start of node */ 285 short ts_curi; /* index in list of child nodes */ 286 char_u ts_fidx; /* index in fword[], case-folded bad word */ 287 char_u ts_fidxtry; /* ts_fidx at which bytes may be changed */ 288 char_u ts_twordlen; /* valid length of tword[] */ 289 char_u ts_prefixdepth; /* stack depth for end of prefix or 290 * PFD_PREFIXTREE or PFD_NOPREFIX */ 291 char_u ts_flags; /* TSF_ flags */ 292 char_u ts_tcharlen; /* number of bytes in tword character */ 293 char_u ts_tcharidx; /* current byte index in tword character */ 294 char_u ts_isdiff; /* DIFF_ values */ 295 char_u ts_fcharstart; /* index in fword where badword char started */ 296 char_u ts_prewordlen; /* length of word in "preword[]" */ 297 char_u ts_splitoff; /* index in "tword" after last split */ 298 char_u ts_splitfidx; /* "ts_fidx" at word split */ 299 char_u ts_complen; /* nr of compound words used */ 300 char_u ts_compsplit; /* index for "compflags" where word was spit */ 301 char_u ts_save_badflags; /* su_badflags saved here */ 302 char_u ts_delidx; /* index in fword for char that was deleted, 303 valid when "ts_flags" has TSF_DIDDEL */ 304 } trystate_T; 305 306 /* values for ts_isdiff */ 307 #define DIFF_NONE 0 /* no different byte (yet) */ 308 #define DIFF_YES 1 /* different byte found */ 309 #define DIFF_INSERT 2 /* inserting character */ 310 311 /* values for ts_flags */ 312 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */ 313 #define TSF_DIDSPLIT 2 /* tried split at this point */ 314 #define TSF_DIDDEL 4 /* did a delete, "ts_delidx" has index */ 315 316 /* special values ts_prefixdepth */ 317 #define PFD_NOPREFIX 0xff /* not using prefixes */ 318 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */ 319 #define PFD_NOTSPECIAL 0xfd /* highest value that's not special */ 320 321 /* mode values 
for find_word */ 322 #define FIND_FOLDWORD 0 /* find word case-folded */ 323 #define FIND_KEEPWORD 1 /* find keep-case word */ 324 #define FIND_PREFIX 2 /* find word after prefix */ 325 #define FIND_COMPOUND 3 /* find case-folded compound word */ 326 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ 327 328 static void find_word(matchinf_T *mip, int mode); 329 static int match_checkcompoundpattern(char_u *ptr, int wlen, garray_T *gap); 330 static int can_compound(slang_T *slang, char_u *word, char_u *flags); 331 static int match_compoundrule(slang_T *slang, char_u *compflags); 332 static int valid_word_prefix(int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req); 333 static void find_prefix(matchinf_T *mip, int mode); 334 static int fold_more(matchinf_T *mip); 335 static int spell_valid_case(int wordflags, int treeflags); 336 static void spell_load_cb(char_u *fname, void *cookie); 337 static int count_syllables(slang_T *slang, char_u *word); 338 static void clear_midword(win_T *buf); 339 static void use_midword(slang_T *lp, win_T *buf); 340 static int find_region(char_u *rp, char_u *region); 341 static int check_need_cap(linenr_T lnum, colnr_T col); 342 static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive); 343 #ifdef FEAT_EVAL 344 static void spell_suggest_expr(suginfo_T *su, char_u *expr); 345 #endif 346 static void spell_suggest_file(suginfo_T *su, char_u *fname); 347 static void spell_suggest_intern(suginfo_T *su, int interactive); 348 static void spell_find_cleanup(suginfo_T *su); 349 static void suggest_try_special(suginfo_T *su); 350 static void suggest_try_change(suginfo_T *su); 351 static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold); 352 static void go_deeper(trystate_T *stack, int depth, int score_add); 353 static int nofold_len(char_u *fword, int flen, char_u *word); 354 static void 
find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword); 355 static void score_comp_sal(suginfo_T *su); 356 static void score_combine(suginfo_T *su); 357 static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound); 358 static void suggest_try_soundalike_prep(void); 359 static void suggest_try_soundalike(suginfo_T *su); 360 static void suggest_try_soundalike_finish(void); 361 static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp); 362 static int soundfold_find(slang_T *slang, char_u *word); 363 static void make_case_word(char_u *fword, char_u *cword, int flags); 364 static int similar_chars(slang_T *slang, int c1, int c2); 365 static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf); 366 static void check_suggestions(suginfo_T *su, garray_T *gap); 367 static void add_banned(suginfo_T *su, char_u *word); 368 static void rescore_suggestions(suginfo_T *su); 369 static void rescore_one(suginfo_T *su, suggest_T *stp); 370 static int cleanup_suggestions(garray_T *gap, int maxscore, int keep); 371 static void spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res); 372 static void spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res); 373 static void spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res); 374 static int soundalike_score(char_u *goodsound, char_u *badsound); 375 static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword); 376 static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit); 377 static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit); 378 static void dump_word(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T lnum); 379 static linenr_T dump_prefixes(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, 
linenr_T startlnum); 380 381 382 /* Remember what "z?" replaced. */ 383 static char_u *repl_from = NULL; 384 static char_u *repl_to = NULL; 385 386 /* 387 * Main spell-checking function. 388 * "ptr" points to a character that could be the start of a word. 389 * "*attrp" is set to the highlight index for a badly spelled word. For a 390 * non-word or when it's OK it remains unchanged. 391 * This must only be called when 'spelllang' is not empty. 392 * 393 * "capcol" is used to check for a Capitalised word after the end of a 394 * sentence. If it's zero then perform the check. Return the column where to 395 * check next, or -1 when no sentence end was found. If it's NULL then don't 396 * worry. 397 * 398 * Returns the length of the word in bytes, also when it's OK, so that the 399 * caller can skip over the word. 400 */ 401 int 402 spell_check( 403 win_T *wp, /* current window */ 404 char_u *ptr, 405 hlf_T *attrp, 406 int *capcol, /* column to check for Capital */ 407 int docount) /* count good words */ 408 { 409 matchinf_T mi; /* Most things are put in "mi" so that it can 410 be passed to functions quickly. */ 411 int nrlen = 0; /* found a number first */ 412 int c; 413 int wrongcaplen = 0; 414 int lpi; 415 int count_word = docount; 416 417 /* A word never starts at a space or a control character. Return quickly 418 * then, skipping over the character. */ 419 if (*ptr <= ' ') 420 return 1; 421 422 /* Return here when loading language files failed. */ 423 if (wp->w_s->b_langp.ga_len == 0) 424 return 1; 425 426 vim_memset(&mi, 0, sizeof(matchinf_T)); 427 428 /* A number is always OK. Also skip hexadecimal numbers 0xFF99 and 429 * 0X99FF. But always do check spelling to find "3GPP" and "11 430 * julifeest". 
*/ 431 if (*ptr >= '0' && *ptr <= '9') 432 { 433 if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B')) 434 mi.mi_end = skipbin(ptr + 2); 435 else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) 436 mi.mi_end = skiphex(ptr + 2); 437 else 438 mi.mi_end = skipdigits(ptr); 439 nrlen = (int)(mi.mi_end - ptr); 440 } 441 442 /* Find the normal end of the word (until the next non-word character). */ 443 mi.mi_word = ptr; 444 mi.mi_fend = ptr; 445 if (spell_iswordp(mi.mi_fend, wp)) 446 { 447 do 448 MB_PTR_ADV(mi.mi_fend); 449 while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp)); 450 451 if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL) 452 { 453 /* Check word starting with capital letter. */ 454 c = PTR2CHAR(ptr); 455 if (!SPELL_ISUPPER(c)) 456 wrongcaplen = (int)(mi.mi_fend - ptr); 457 } 458 } 459 if (capcol != NULL) 460 *capcol = -1; 461 462 /* We always use the characters up to the next non-word character, 463 * also for bad words. */ 464 mi.mi_end = mi.mi_fend; 465 466 /* Check caps type later. */ 467 mi.mi_capflags = 0; 468 mi.mi_cend = NULL; 469 mi.mi_win = wp; 470 471 /* case-fold the word with one non-word character, so that we can check 472 * for the word end. */ 473 if (*mi.mi_fend != NUL) 474 MB_PTR_ADV(mi.mi_fend); 475 476 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, 477 MAXWLEN + 1); 478 mi.mi_fwordlen = (int)STRLEN(mi.mi_fword); 479 480 /* The word is bad unless we recognize it. */ 481 mi.mi_result = SP_BAD; 482 mi.mi_result2 = SP_BAD; 483 484 /* 485 * Loop over the languages specified in 'spelllang'. 486 * We check them all, because a word may be matched longer in another 487 * language. 488 */ 489 for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi) 490 { 491 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi); 492 493 /* If reloading fails the language is still in the list but everything 494 * has been cleared. 
*/ 495 if (mi.mi_lp->lp_slang->sl_fidxs == NULL) 496 continue; 497 498 /* Check for a matching word in case-folded words. */ 499 find_word(&mi, FIND_FOLDWORD); 500 501 /* Check for a matching word in keep-case words. */ 502 find_word(&mi, FIND_KEEPWORD); 503 504 /* Check for matching prefixes. */ 505 find_prefix(&mi, FIND_FOLDWORD); 506 507 /* For a NOBREAK language, may want to use a word without a following 508 * word as a backup. */ 509 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD 510 && mi.mi_result2 != SP_BAD) 511 { 512 mi.mi_result = mi.mi_result2; 513 mi.mi_end = mi.mi_end2; 514 } 515 516 /* Count the word in the first language where it's found to be OK. */ 517 if (count_word && mi.mi_result == SP_OK) 518 { 519 count_common_word(mi.mi_lp->lp_slang, ptr, 520 (int)(mi.mi_end - ptr), 1); 521 count_word = FALSE; 522 } 523 } 524 525 if (mi.mi_result != SP_OK) 526 { 527 /* If we found a number skip over it. Allows for "42nd". Do flag 528 * rare and local words, e.g., "3GPP". */ 529 if (nrlen > 0) 530 { 531 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) 532 return nrlen; 533 } 534 535 /* When we are at a non-word character there is no error, just 536 * skip over the character (try looking for a word after it). */ 537 else if (!spell_iswordp_nmw(ptr, wp)) 538 { 539 if (capcol != NULL && wp->w_s->b_cap_prog != NULL) 540 { 541 regmatch_T regmatch; 542 int r; 543 544 /* Check for end of sentence. */ 545 regmatch.regprog = wp->w_s->b_cap_prog; 546 regmatch.rm_ic = FALSE; 547 r = vim_regexec(®match, ptr, 0); 548 wp->w_s->b_cap_prog = regmatch.regprog; 549 if (r) 550 *capcol = (int)(regmatch.endp[0] - ptr); 551 } 552 553 if (has_mbyte) 554 return (*mb_ptr2len)(ptr); 555 return 1; 556 } 557 else if (mi.mi_end == ptr) 558 /* Always include at least one character. Required for when there 559 * is a mixup in "midword". 
*/ 560 MB_PTR_ADV(mi.mi_end); 561 else if (mi.mi_result == SP_BAD 562 && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak) 563 { 564 char_u *p, *fp; 565 int save_result = mi.mi_result; 566 567 /* First language in 'spelllang' is NOBREAK. Find first position 568 * at which any word would be valid. */ 569 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0); 570 if (mi.mi_lp->lp_slang->sl_fidxs != NULL) 571 { 572 p = mi.mi_word; 573 fp = mi.mi_fword; 574 for (;;) 575 { 576 MB_PTR_ADV(p); 577 MB_PTR_ADV(fp); 578 if (p >= mi.mi_end) 579 break; 580 mi.mi_compoff = (int)(fp - mi.mi_fword); 581 find_word(&mi, FIND_COMPOUND); 582 if (mi.mi_result != SP_BAD) 583 { 584 mi.mi_end = p; 585 break; 586 } 587 } 588 mi.mi_result = save_result; 589 } 590 } 591 592 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) 593 *attrp = HLF_SPB; 594 else if (mi.mi_result == SP_RARE) 595 *attrp = HLF_SPR; 596 else 597 *attrp = HLF_SPL; 598 } 599 600 if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE)) 601 { 602 /* Report SpellCap only when the word isn't badly spelled. */ 603 *attrp = HLF_SPC; 604 return wrongcaplen; 605 } 606 607 return (int)(mi.mi_end - ptr); 608 } 609 610 /* 611 * Check if the word at "mip->mi_word" is in the tree. 612 * When "mode" is FIND_FOLDWORD check in fold-case word tree. 613 * When "mode" is FIND_KEEPWORD check in keep-case word tree. 614 * When "mode" is FIND_PREFIX check for word after prefix in fold-case word 615 * tree. 616 * 617 * For a match mip->mi_result is updated. 
618 */ 619 static void 620 find_word(matchinf_T *mip, int mode) 621 { 622 idx_T arridx = 0; 623 int endlen[MAXWLEN]; /* length at possible word endings */ 624 idx_T endidx[MAXWLEN]; /* possible word endings */ 625 int endidxcnt = 0; 626 int len; 627 int wlen = 0; 628 int flen; 629 int c; 630 char_u *ptr; 631 idx_T lo, hi, m; 632 char_u *s; 633 char_u *p; 634 int res = SP_BAD; 635 slang_T *slang = mip->mi_lp->lp_slang; 636 unsigned flags; 637 char_u *byts; 638 idx_T *idxs; 639 int word_ends; 640 int prefix_found; 641 int nobreak_result; 642 643 if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) 644 { 645 /* Check for word with matching case in keep-case tree. */ 646 ptr = mip->mi_word; 647 flen = 9999; /* no case folding, always enough bytes */ 648 byts = slang->sl_kbyts; 649 idxs = slang->sl_kidxs; 650 651 if (mode == FIND_KEEPCOMPOUND) 652 /* Skip over the previously found word(s). */ 653 wlen += mip->mi_compoff; 654 } 655 else 656 { 657 /* Check for case-folded in case-folded tree. */ 658 ptr = mip->mi_fword; 659 flen = mip->mi_fwordlen; /* available case-folded bytes */ 660 byts = slang->sl_fbyts; 661 idxs = slang->sl_fidxs; 662 663 if (mode == FIND_PREFIX) 664 { 665 /* Skip over the prefix. */ 666 wlen = mip->mi_prefixlen; 667 flen -= mip->mi_prefixlen; 668 } 669 else if (mode == FIND_COMPOUND) 670 { 671 /* Skip over the previously found word(s). */ 672 wlen = mip->mi_compoff; 673 flen -= mip->mi_compoff; 674 } 675 676 } 677 678 if (byts == NULL) 679 return; /* array is empty */ 680 681 /* 682 * Repeat advancing in the tree until: 683 * - there is a byte that doesn't match, 684 * - we reach the end of the tree, 685 * - or we reach the end of the line. 686 */ 687 for (;;) 688 { 689 if (flen <= 0 && *mip->mi_fend != NUL) 690 flen = fold_more(mip); 691 692 len = byts[arridx++]; 693 694 /* If the first possible byte is a zero the word could end here. 695 * Remember this index, we first check for the longest word. 
*/ 696 if (byts[arridx] == 0) 697 { 698 if (endidxcnt == MAXWLEN) 699 { 700 /* Must be a corrupted spell file. */ 701 emsg(_(e_format)); 702 return; 703 } 704 endlen[endidxcnt] = wlen; 705 endidx[endidxcnt++] = arridx++; 706 --len; 707 708 /* Skip over the zeros, there can be several flag/region 709 * combinations. */ 710 while (len > 0 && byts[arridx] == 0) 711 { 712 ++arridx; 713 --len; 714 } 715 if (len == 0) 716 break; /* no children, word must end here */ 717 } 718 719 /* Stop looking at end of the line. */ 720 if (ptr[wlen] == NUL) 721 break; 722 723 /* Perform a binary search in the list of accepted bytes. */ 724 c = ptr[wlen]; 725 if (c == TAB) /* <Tab> is handled like <Space> */ 726 c = ' '; 727 lo = arridx; 728 hi = arridx + len - 1; 729 while (lo < hi) 730 { 731 m = (lo + hi) / 2; 732 if (byts[m] > c) 733 hi = m - 1; 734 else if (byts[m] < c) 735 lo = m + 1; 736 else 737 { 738 lo = hi = m; 739 break; 740 } 741 } 742 743 /* Stop if there is no matching byte. */ 744 if (hi < lo || byts[lo] != c) 745 break; 746 747 /* Continue at the child (if there is one). */ 748 arridx = idxs[lo]; 749 ++wlen; 750 --flen; 751 752 /* One space in the good word may stand for several spaces in the 753 * checked word. */ 754 if (c == ' ') 755 { 756 for (;;) 757 { 758 if (flen <= 0 && *mip->mi_fend != NUL) 759 flen = fold_more(mip); 760 if (ptr[wlen] != ' ' && ptr[wlen] != TAB) 761 break; 762 ++wlen; 763 --flen; 764 } 765 } 766 } 767 768 /* 769 * Verify that one of the possible endings is valid. Try the longest 770 * first. 
771 */ 772 while (endidxcnt > 0) 773 { 774 --endidxcnt; 775 arridx = endidx[endidxcnt]; 776 wlen = endlen[endidxcnt]; 777 778 if ((*mb_head_off)(ptr, ptr + wlen) > 0) 779 continue; /* not at first byte of character */ 780 if (spell_iswordp(ptr + wlen, mip->mi_win)) 781 { 782 if (slang->sl_compprog == NULL && !slang->sl_nobreak) 783 continue; /* next char is a word character */ 784 word_ends = FALSE; 785 } 786 else 787 word_ends = TRUE; 788 /* The prefix flag is before compound flags. Once a valid prefix flag 789 * has been found we try compound flags. */ 790 prefix_found = FALSE; 791 792 if (mode != FIND_KEEPWORD && has_mbyte) 793 { 794 /* Compute byte length in original word, length may change 795 * when folding case. This can be slow, take a shortcut when the 796 * case-folded word is equal to the keep-case word. */ 797 p = mip->mi_word; 798 if (STRNCMP(ptr, p, wlen) != 0) 799 { 800 for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) 801 MB_PTR_ADV(p); 802 wlen = (int)(p - mip->mi_word); 803 } 804 } 805 806 /* Check flags and region. For FIND_PREFIX check the condition and 807 * prefix ID. 808 * Repeat this if there are more flags/region alternatives until there 809 * is a match. */ 810 res = SP_BAD; 811 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; 812 --len, ++arridx) 813 { 814 flags = idxs[arridx]; 815 816 /* For the fold-case tree check that the case of the checked word 817 * matches with what the word in the tree requires. 818 * For keep-case tree the case is always right. For prefixes we 819 * don't bother to check. */ 820 if (mode == FIND_FOLDWORD) 821 { 822 if (mip->mi_cend != mip->mi_word + wlen) 823 { 824 /* mi_capflags was set for a different word length, need 825 * to do it again. 
*/ 826 mip->mi_cend = mip->mi_word + wlen; 827 mip->mi_capflags = captype(mip->mi_word, mip->mi_cend); 828 } 829 830 if (mip->mi_capflags == WF_KEEPCAP 831 || !spell_valid_case(mip->mi_capflags, flags)) 832 continue; 833 } 834 835 /* When mode is FIND_PREFIX the word must support the prefix: 836 * check the prefix ID and the condition. Do that for the list at 837 * mip->mi_prefarridx that find_prefix() filled. */ 838 else if (mode == FIND_PREFIX && !prefix_found) 839 { 840 c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx, 841 flags, 842 mip->mi_word + mip->mi_cprefixlen, slang, 843 FALSE); 844 if (c == 0) 845 continue; 846 847 /* Use the WF_RARE flag for a rare prefix. */ 848 if (c & WF_RAREPFX) 849 flags |= WF_RARE; 850 prefix_found = TRUE; 851 } 852 853 if (slang->sl_nobreak) 854 { 855 if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND) 856 && (flags & WF_BANNED) == 0) 857 { 858 /* NOBREAK: found a valid following word. That's all we 859 * need to know, so return. */ 860 mip->mi_result = SP_OK; 861 break; 862 } 863 } 864 865 else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND 866 || !word_ends)) 867 { 868 /* If there is no compound flag or the word is shorter than 869 * COMPOUNDMIN reject it quickly. 870 * Makes you wonder why someone puts a compound flag on a word 871 * that's too short... Myspell compatibility requires this 872 * anyway. */ 873 if (((unsigned)flags >> 24) == 0 874 || wlen - mip->mi_compoff < slang->sl_compminlen) 875 continue; 876 /* For multi-byte chars check character length against 877 * COMPOUNDMIN. */ 878 if (has_mbyte 879 && slang->sl_compminlen > 0 880 && mb_charlen_len(mip->mi_word + mip->mi_compoff, 881 wlen - mip->mi_compoff) < slang->sl_compminlen) 882 continue; 883 884 /* Limit the number of compound words to COMPOUNDWORDMAX if no 885 * maximum for syllables is specified. 
*/ 886 if (!word_ends && mip->mi_complen + mip->mi_compextra + 2 887 > slang->sl_compmax 888 && slang->sl_compsylmax == MAXWLEN) 889 continue; 890 891 /* Don't allow compounding on a side where an affix was added, 892 * unless COMPOUNDPERMITFLAG was used. */ 893 if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF)) 894 continue; 895 if (!word_ends && (flags & WF_NOCOMPAFT)) 896 continue; 897 898 /* Quickly check if compounding is possible with this flag. */ 899 if (!byte_in_str(mip->mi_complen == 0 900 ? slang->sl_compstartflags 901 : slang->sl_compallflags, 902 ((unsigned)flags >> 24))) 903 continue; 904 905 /* If there is a match with a CHECKCOMPOUNDPATTERN rule 906 * discard the compound word. */ 907 if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat)) 908 continue; 909 910 if (mode == FIND_COMPOUND) 911 { 912 int capflags; 913 914 /* Need to check the caps type of the appended compound 915 * word. */ 916 if (has_mbyte && STRNCMP(ptr, mip->mi_word, 917 mip->mi_compoff) != 0) 918 { 919 /* case folding may have changed the length */ 920 p = mip->mi_word; 921 for (s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s)) 922 MB_PTR_ADV(p); 923 } 924 else 925 p = mip->mi_word + mip->mi_compoff; 926 capflags = captype(p, mip->mi_word + wlen); 927 if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP 928 && (flags & WF_FIXCAP) != 0)) 929 continue; 930 931 if (capflags != WF_ALLCAP) 932 { 933 /* When the character before the word is a word 934 * character we do not accept a Onecap word. We do 935 * accept a no-caps word, even when the dictionary 936 * word specifies ONECAP. */ 937 MB_PTR_BACK(mip->mi_word, p); 938 if (spell_iswordp_nmw(p, mip->mi_win) 939 ? capflags == WF_ONECAP 940 : (flags & WF_ONECAP) != 0 941 && capflags != WF_ONECAP) 942 continue; 943 } 944 } 945 946 /* If the word ends the sequence of compound flags of the 947 * words must match with one of the COMPOUNDRULE items and 948 * the number of syllables must not be too large. 
*/ 949 mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24); 950 mip->mi_compflags[mip->mi_complen + 1] = NUL; 951 if (word_ends) 952 { 953 char_u fword[MAXWLEN]; 954 955 if (slang->sl_compsylmax < MAXWLEN) 956 { 957 /* "fword" is only needed for checking syllables. */ 958 if (ptr == mip->mi_word) 959 (void)spell_casefold(ptr, wlen, fword, MAXWLEN); 960 else 961 vim_strncpy(fword, ptr, endlen[endidxcnt]); 962 } 963 if (!can_compound(slang, fword, mip->mi_compflags)) 964 continue; 965 } 966 else if (slang->sl_comprules != NULL 967 && !match_compoundrule(slang, mip->mi_compflags)) 968 /* The compound flags collected so far do not match any 969 * COMPOUNDRULE, discard the compounded word. */ 970 continue; 971 } 972 973 /* Check NEEDCOMPOUND: can't use word without compounding. */ 974 else if (flags & WF_NEEDCOMP) 975 continue; 976 977 nobreak_result = SP_OK; 978 979 if (!word_ends) 980 { 981 int save_result = mip->mi_result; 982 char_u *save_end = mip->mi_end; 983 langp_T *save_lp = mip->mi_lp; 984 int lpi; 985 986 /* Check that a valid word follows. If there is one and we 987 * are compounding, it will set "mi_result", thus we are 988 * always finished here. For NOBREAK we only check that a 989 * valid word follows. 990 * Recursive! */ 991 if (slang->sl_nobreak) 992 mip->mi_result = SP_BAD; 993 994 /* Find following word in case-folded tree. */ 995 mip->mi_compoff = endlen[endidxcnt]; 996 if (has_mbyte && mode == FIND_KEEPWORD) 997 { 998 /* Compute byte length in case-folded word from "wlen": 999 * byte length in keep-case word. Length may change when 1000 * folding case. This can be slow, take a shortcut when 1001 * the case-folded word is equal to the keep-case word. 
*/ 1002 p = mip->mi_fword; 1003 if (STRNCMP(ptr, p, wlen) != 0) 1004 { 1005 for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) 1006 MB_PTR_ADV(p); 1007 mip->mi_compoff = (int)(p - mip->mi_fword); 1008 } 1009 } 1010 #if 0 /* Disabled, see below */ 1011 c = mip->mi_compoff; 1012 #endif 1013 ++mip->mi_complen; 1014 if (flags & WF_COMPROOT) 1015 ++mip->mi_compextra; 1016 1017 /* For NOBREAK we need to try all NOBREAK languages, at least 1018 * to find the ".add" file(s). */ 1019 for (lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi) 1020 { 1021 if (slang->sl_nobreak) 1022 { 1023 mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi); 1024 if (mip->mi_lp->lp_slang->sl_fidxs == NULL 1025 || !mip->mi_lp->lp_slang->sl_nobreak) 1026 continue; 1027 } 1028 1029 find_word(mip, FIND_COMPOUND); 1030 1031 /* When NOBREAK any word that matches is OK. Otherwise we 1032 * need to find the longest match, thus try with keep-case 1033 * and prefix too. */ 1034 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) 1035 { 1036 /* Find following word in keep-case tree. */ 1037 mip->mi_compoff = wlen; 1038 find_word(mip, FIND_KEEPCOMPOUND); 1039 1040 #if 0 /* Disabled, a prefix must not appear halfway a compound word, 1041 unless the COMPOUNDPERMITFLAG is used and then it can't be a 1042 postponed prefix. */ 1043 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) 1044 { 1045 /* Check for following word with prefix. 
*/ 1046 mip->mi_compoff = c; 1047 find_prefix(mip, FIND_COMPOUND); 1048 } 1049 #endif 1050 } 1051 1052 if (!slang->sl_nobreak) 1053 break; 1054 } 1055 --mip->mi_complen; 1056 if (flags & WF_COMPROOT) 1057 --mip->mi_compextra; 1058 mip->mi_lp = save_lp; 1059 1060 if (slang->sl_nobreak) 1061 { 1062 nobreak_result = mip->mi_result; 1063 mip->mi_result = save_result; 1064 mip->mi_end = save_end; 1065 } 1066 else 1067 { 1068 if (mip->mi_result == SP_OK) 1069 break; 1070 continue; 1071 } 1072 } 1073 1074 if (flags & WF_BANNED) 1075 res = SP_BANNED; 1076 else if (flags & WF_REGION) 1077 { 1078 /* Check region. */ 1079 if ((mip->mi_lp->lp_region & (flags >> 16)) != 0) 1080 res = SP_OK; 1081 else 1082 res = SP_LOCAL; 1083 } 1084 else if (flags & WF_RARE) 1085 res = SP_RARE; 1086 else 1087 res = SP_OK; 1088 1089 /* Always use the longest match and the best result. For NOBREAK 1090 * we separately keep the longest match without a following good 1091 * word as a fall-back. */ 1092 if (nobreak_result == SP_BAD) 1093 { 1094 if (mip->mi_result2 > res) 1095 { 1096 mip->mi_result2 = res; 1097 mip->mi_end2 = mip->mi_word + wlen; 1098 } 1099 else if (mip->mi_result2 == res 1100 && mip->mi_end2 < mip->mi_word + wlen) 1101 mip->mi_end2 = mip->mi_word + wlen; 1102 } 1103 else if (mip->mi_result > res) 1104 { 1105 mip->mi_result = res; 1106 mip->mi_end = mip->mi_word + wlen; 1107 } 1108 else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen) 1109 mip->mi_end = mip->mi_word + wlen; 1110 1111 if (mip->mi_result == SP_OK) 1112 break; 1113 } 1114 1115 if (mip->mi_result == SP_OK) 1116 break; 1117 } 1118 } 1119 1120 /* 1121 * Return TRUE if there is a match between the word ptr[wlen] and 1122 * CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another 1123 * word. 1124 * A match means that the first part of CHECKCOMPOUNDPATTERN matches at the 1125 * end of ptr[wlen] and the second part matches after it. 
1126 */ 1127 static int 1128 match_checkcompoundpattern( 1129 char_u *ptr, 1130 int wlen, 1131 garray_T *gap) /* &sl_comppat */ 1132 { 1133 int i; 1134 char_u *p; 1135 int len; 1136 1137 for (i = 0; i + 1 < gap->ga_len; i += 2) 1138 { 1139 p = ((char_u **)gap->ga_data)[i + 1]; 1140 if (STRNCMP(ptr + wlen, p, STRLEN(p)) == 0) 1141 { 1142 /* Second part matches at start of following compound word, now 1143 * check if first part matches at end of previous word. */ 1144 p = ((char_u **)gap->ga_data)[i]; 1145 len = (int)STRLEN(p); 1146 if (len <= wlen && STRNCMP(ptr + wlen - len, p, len) == 0) 1147 return TRUE; 1148 } 1149 } 1150 return FALSE; 1151 } 1152 1153 /* 1154 * Return TRUE if "flags" is a valid sequence of compound flags and "word" 1155 * does not have too many syllables. 1156 */ 1157 static int 1158 can_compound(slang_T *slang, char_u *word, char_u *flags) 1159 { 1160 char_u uflags[MAXWLEN * 2]; 1161 int i; 1162 char_u *p; 1163 1164 if (slang->sl_compprog == NULL) 1165 return FALSE; 1166 if (enc_utf8) 1167 { 1168 /* Need to convert the single byte flags to utf8 characters. */ 1169 p = uflags; 1170 for (i = 0; flags[i] != NUL; ++i) 1171 p += utf_char2bytes(flags[i], p); 1172 *p = NUL; 1173 p = uflags; 1174 } 1175 else 1176 p = flags; 1177 if (!vim_regexec_prog(&slang->sl_compprog, FALSE, p, 0)) 1178 return FALSE; 1179 1180 /* Count the number of syllables. This may be slow, do it last. If there 1181 * are too many syllables AND the number of compound words is above 1182 * COMPOUNDWORDMAX then compounding is not allowed. */ 1183 if (slang->sl_compsylmax < MAXWLEN 1184 && count_syllables(slang, word) > slang->sl_compsylmax) 1185 return (int)STRLEN(flags) < slang->sl_compmax; 1186 return TRUE; 1187 } 1188 1189 /* 1190 * Return TRUE when the sequence of flags in "compflags" plus "flag" can 1191 * possibly form a valid compounded word. This also checks the COMPOUNDRULE 1192 * lines if they don't contain wildcards. 
1193 */ 1194 static int 1195 can_be_compound( 1196 trystate_T *sp, 1197 slang_T *slang, 1198 char_u *compflags, 1199 int flag) 1200 { 1201 /* If the flag doesn't appear in sl_compstartflags or sl_compallflags 1202 * then it can't possibly compound. */ 1203 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit 1204 ? slang->sl_compstartflags : slang->sl_compallflags, flag)) 1205 return FALSE; 1206 1207 /* If there are no wildcards, we can check if the flags collected so far 1208 * possibly can form a match with COMPOUNDRULE patterns. This only 1209 * makes sense when we have two or more words. */ 1210 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) 1211 { 1212 int v; 1213 1214 compflags[sp->ts_complen] = flag; 1215 compflags[sp->ts_complen + 1] = NUL; 1216 v = match_compoundrule(slang, compflags + sp->ts_compsplit); 1217 compflags[sp->ts_complen] = NUL; 1218 return v; 1219 } 1220 1221 return TRUE; 1222 } 1223 1224 1225 /* 1226 * Return TRUE if the compound flags in compflags[] match the start of any 1227 * compound rule. This is used to stop trying a compound if the flags 1228 * collected so far can't possibly match any compound rule. 1229 * Caller must check that slang->sl_comprules is not NULL. 
1230 */ 1231 static int 1232 match_compoundrule(slang_T *slang, char_u *compflags) 1233 { 1234 char_u *p; 1235 int i; 1236 int c; 1237 1238 /* loop over all the COMPOUNDRULE entries */ 1239 for (p = slang->sl_comprules; *p != NUL; ++p) 1240 { 1241 /* loop over the flags in the compound word we have made, match 1242 * them against the current rule entry */ 1243 for (i = 0; ; ++i) 1244 { 1245 c = compflags[i]; 1246 if (c == NUL) 1247 /* found a rule that matches for the flags we have so far */ 1248 return TRUE; 1249 if (*p == '/' || *p == NUL) 1250 break; /* end of rule, it's too short */ 1251 if (*p == '[') 1252 { 1253 int match = FALSE; 1254 1255 /* compare against all the flags in [] */ 1256 ++p; 1257 while (*p != ']' && *p != NUL) 1258 if (*p++ == c) 1259 match = TRUE; 1260 if (!match) 1261 break; /* none matches */ 1262 } 1263 else if (*p != c) 1264 break; /* flag of word doesn't match flag in pattern */ 1265 ++p; 1266 } 1267 1268 /* Skip to the next "/", where the next pattern starts. */ 1269 p = vim_strchr(p, '/'); 1270 if (p == NULL) 1271 break; 1272 } 1273 1274 /* Checked all the rules and none of them match the flags, so there 1275 * can't possibly be a compound starting with these flags. */ 1276 return FALSE; 1277 } 1278 1279 /* 1280 * Return non-zero if the prefix indicated by "arridx" matches with the prefix 1281 * ID in "flags" for the word "word". 1282 * The WF_RAREPFX flag is included in the return value for a rare prefix. 1283 */ 1284 static int 1285 valid_word_prefix( 1286 int totprefcnt, /* nr of prefix IDs */ 1287 int arridx, /* idx in sl_pidxs[] */ 1288 int flags, 1289 char_u *word, 1290 slang_T *slang, 1291 int cond_req) /* only use prefixes with a condition */ 1292 { 1293 int prefcnt; 1294 int pidx; 1295 regprog_T **rp; 1296 int prefid; 1297 1298 prefid = (unsigned)flags >> 24; 1299 for (prefcnt = totprefcnt - 1; prefcnt >= 0; --prefcnt) 1300 { 1301 pidx = slang->sl_pidxs[arridx + prefcnt]; 1302 1303 /* Check the prefix ID. 
*/ 1304 if (prefid != (pidx & 0xff)) 1305 continue; 1306 1307 /* Check if the prefix doesn't combine and the word already has a 1308 * suffix. */ 1309 if ((flags & WF_HAS_AFF) && (pidx & WF_PFX_NC)) 1310 continue; 1311 1312 /* Check the condition, if there is one. The condition index is 1313 * stored in the two bytes above the prefix ID byte. */ 1314 rp = &slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff]; 1315 if (*rp != NULL) 1316 { 1317 if (!vim_regexec_prog(rp, FALSE, word, 0)) 1318 continue; 1319 } 1320 else if (cond_req) 1321 continue; 1322 1323 /* It's a match! Return the WF_ flags. */ 1324 return pidx; 1325 } 1326 return 0; 1327 } 1328 1329 /* 1330 * Check if the word at "mip->mi_word" has a matching prefix. 1331 * If it does, then check the following word. 1332 * 1333 * If "mode" is "FIND_COMPOUND" then do the same after another word, find a 1334 * prefix in a compound word. 1335 * 1336 * For a match mip->mi_result is updated. 1337 */ 1338 static void 1339 find_prefix(matchinf_T *mip, int mode) 1340 { 1341 idx_T arridx = 0; 1342 int len; 1343 int wlen = 0; 1344 int flen; 1345 int c; 1346 char_u *ptr; 1347 idx_T lo, hi, m; 1348 slang_T *slang = mip->mi_lp->lp_slang; 1349 char_u *byts; 1350 idx_T *idxs; 1351 1352 byts = slang->sl_pbyts; 1353 if (byts == NULL) 1354 return; /* array is empty */ 1355 1356 /* We use the case-folded word here, since prefixes are always 1357 * case-folded. */ 1358 ptr = mip->mi_fword; 1359 flen = mip->mi_fwordlen; /* available case-folded bytes */ 1360 if (mode == FIND_COMPOUND) 1361 { 1362 /* Skip over the previously found word(s). */ 1363 ptr += mip->mi_compoff; 1364 flen -= mip->mi_compoff; 1365 } 1366 idxs = slang->sl_pidxs; 1367 1368 /* 1369 * Repeat advancing in the tree until: 1370 * - there is a byte that doesn't match, 1371 * - we reach the end of the tree, 1372 * - or we reach the end of the line. 
1373 */ 1374 for (;;) 1375 { 1376 if (flen == 0 && *mip->mi_fend != NUL) 1377 flen = fold_more(mip); 1378 1379 len = byts[arridx++]; 1380 1381 /* If the first possible byte is a zero the prefix could end here. 1382 * Check if the following word matches and supports the prefix. */ 1383 if (byts[arridx] == 0) 1384 { 1385 /* There can be several prefixes with different conditions. We 1386 * try them all, since we don't know which one will give the 1387 * longest match. The word is the same each time, pass the list 1388 * of possible prefixes to find_word(). */ 1389 mip->mi_prefarridx = arridx; 1390 mip->mi_prefcnt = len; 1391 while (len > 0 && byts[arridx] == 0) 1392 { 1393 ++arridx; 1394 --len; 1395 } 1396 mip->mi_prefcnt -= len; 1397 1398 /* Find the word that comes after the prefix. */ 1399 mip->mi_prefixlen = wlen; 1400 if (mode == FIND_COMPOUND) 1401 /* Skip over the previously found word(s). */ 1402 mip->mi_prefixlen += mip->mi_compoff; 1403 1404 if (has_mbyte) 1405 { 1406 /* Case-folded length may differ from original length. */ 1407 mip->mi_cprefixlen = nofold_len(mip->mi_fword, 1408 mip->mi_prefixlen, mip->mi_word); 1409 } 1410 else 1411 mip->mi_cprefixlen = mip->mi_prefixlen; 1412 find_word(mip, FIND_PREFIX); 1413 1414 1415 if (len == 0) 1416 break; /* no children, word must end here */ 1417 } 1418 1419 /* Stop looking at end of the line. */ 1420 if (ptr[wlen] == NUL) 1421 break; 1422 1423 /* Perform a binary search in the list of accepted bytes. */ 1424 c = ptr[wlen]; 1425 lo = arridx; 1426 hi = arridx + len - 1; 1427 while (lo < hi) 1428 { 1429 m = (lo + hi) / 2; 1430 if (byts[m] > c) 1431 hi = m - 1; 1432 else if (byts[m] < c) 1433 lo = m + 1; 1434 else 1435 { 1436 lo = hi = m; 1437 break; 1438 } 1439 } 1440 1441 /* Stop if there is no matching byte. */ 1442 if (hi < lo || byts[lo] != c) 1443 break; 1444 1445 /* Continue at the child (if there is one). 
*/ 1446 arridx = idxs[lo]; 1447 ++wlen; 1448 --flen; 1449 } 1450 } 1451 1452 /* 1453 * Need to fold at least one more character. Do until next non-word character 1454 * for efficiency. Include the non-word character too. 1455 * Return the length of the folded chars in bytes. 1456 */ 1457 static int 1458 fold_more(matchinf_T *mip) 1459 { 1460 int flen; 1461 char_u *p; 1462 1463 p = mip->mi_fend; 1464 do 1465 MB_PTR_ADV(mip->mi_fend); 1466 while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win)); 1467 1468 /* Include the non-word character so that we can check for the word end. */ 1469 if (*mip->mi_fend != NUL) 1470 MB_PTR_ADV(mip->mi_fend); 1471 1472 (void)spell_casefold(p, (int)(mip->mi_fend - p), 1473 mip->mi_fword + mip->mi_fwordlen, 1474 MAXWLEN - mip->mi_fwordlen); 1475 flen = (int)STRLEN(mip->mi_fword + mip->mi_fwordlen); 1476 mip->mi_fwordlen += flen; 1477 return flen; 1478 } 1479 1480 /* 1481 * Check case flags for a word. Return TRUE if the word has the requested 1482 * case. 1483 */ 1484 static int 1485 spell_valid_case( 1486 int wordflags, /* flags for the checked word. */ 1487 int treeflags) /* flags for the word in the spell tree */ 1488 { 1489 return ((wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0) 1490 || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0 1491 && ((treeflags & WF_ONECAP) == 0 1492 || (wordflags & WF_ONECAP) != 0))); 1493 } 1494 1495 /* 1496 * Return TRUE if spell checking is not enabled. 1497 */ 1498 static int 1499 no_spell_checking(win_T *wp) 1500 { 1501 if (!wp->w_p_spell || *wp->w_s->b_p_spl == NUL 1502 || wp->w_s->b_langp.ga_len == 0) 1503 { 1504 emsg(_("E756: Spell checking is not enabled")); 1505 return TRUE; 1506 } 1507 return FALSE; 1508 } 1509 1510 /* 1511 * Move to next spell error. 1512 * "curline" is FALSE for "[s", "]s", "[S" and "]S". 1513 * "curline" is TRUE to find word under/after cursor in the same line. 
1514 * For Insert mode completion "dir" is BACKWARD and "curline" is TRUE: move 1515 * to after badly spelled word before the cursor. 1516 * Return 0 if not found, length of the badly spelled word otherwise. 1517 */ 1518 int 1519 spell_move_to( 1520 win_T *wp, 1521 int dir, /* FORWARD or BACKWARD */ 1522 int allwords, /* TRUE for "[s"/"]s", FALSE for "[S"/"]S" */ 1523 int curline, 1524 hlf_T *attrp) /* return: attributes of bad word or NULL 1525 (only when "dir" is FORWARD) */ 1526 { 1527 linenr_T lnum; 1528 pos_T found_pos; 1529 int found_len = 0; 1530 char_u *line; 1531 char_u *p; 1532 char_u *endp; 1533 hlf_T attr; 1534 int len; 1535 #ifdef FEAT_SYN_HL 1536 int has_syntax = syntax_present(wp); 1537 #endif 1538 int col; 1539 int can_spell; 1540 char_u *buf = NULL; 1541 int buflen = 0; 1542 int skip = 0; 1543 int capcol = -1; 1544 int found_one = FALSE; 1545 int wrapped = FALSE; 1546 1547 if (no_spell_checking(wp)) 1548 return 0; 1549 1550 /* 1551 * Start looking for bad word at the start of the line, because we can't 1552 * start halfway a word, we don't know where it starts or ends. 1553 * 1554 * When searching backwards, we continue in the line to find the last 1555 * bad word (in the cursor line: before the cursor). 1556 * 1557 * We concatenate the start of the next line, so that wrapped words work 1558 * (e.g. "et<line-break>cetera"). Doesn't work when searching backwards 1559 * though... 1560 */ 1561 lnum = wp->w_cursor.lnum; 1562 CLEAR_POS(&found_pos); 1563 1564 while (!got_int) 1565 { 1566 line = ml_get_buf(wp->w_buffer, lnum, FALSE); 1567 1568 len = (int)STRLEN(line); 1569 if (buflen < len + MAXWLEN + 2) 1570 { 1571 vim_free(buf); 1572 buflen = len + MAXWLEN + 2; 1573 buf = alloc(buflen); 1574 if (buf == NULL) 1575 break; 1576 } 1577 1578 /* In first line check first word for Capital. */ 1579 if (lnum == 1) 1580 capcol = 0; 1581 1582 /* For checking first word with a capital skip white space. 
*/ 1583 if (capcol == 0) 1584 capcol = getwhitecols(line); 1585 else if (curline && wp == curwin) 1586 { 1587 /* For spellbadword(): check if first word needs a capital. */ 1588 col = getwhitecols(line); 1589 if (check_need_cap(lnum, col)) 1590 capcol = col; 1591 1592 /* Need to get the line again, may have looked at the previous 1593 * one. */ 1594 line = ml_get_buf(wp->w_buffer, lnum, FALSE); 1595 } 1596 1597 /* Copy the line into "buf" and append the start of the next line if 1598 * possible. */ 1599 STRCPY(buf, line); 1600 if (lnum < wp->w_buffer->b_ml.ml_line_count) 1601 spell_cat_line(buf + STRLEN(buf), 1602 ml_get_buf(wp->w_buffer, lnum + 1, FALSE), MAXWLEN); 1603 1604 p = buf + skip; 1605 endp = buf + len; 1606 while (p < endp) 1607 { 1608 /* When searching backward don't search after the cursor. Unless 1609 * we wrapped around the end of the buffer. */ 1610 if (dir == BACKWARD 1611 && lnum == wp->w_cursor.lnum 1612 && !wrapped 1613 && (colnr_T)(p - buf) >= wp->w_cursor.col) 1614 break; 1615 1616 /* start of word */ 1617 attr = HLF_COUNT; 1618 len = spell_check(wp, p, &attr, &capcol, FALSE); 1619 1620 if (attr != HLF_COUNT) 1621 { 1622 /* We found a bad word. Check the attribute. */ 1623 if (allwords || attr == HLF_SPB) 1624 { 1625 /* When searching forward only accept a bad word after 1626 * the cursor. */ 1627 if (dir == BACKWARD 1628 || lnum != wp->w_cursor.lnum 1629 || (lnum == wp->w_cursor.lnum 1630 && (wrapped 1631 || (colnr_T)(curline ? 
p - buf + len 1632 : p - buf) 1633 > wp->w_cursor.col))) 1634 { 1635 #ifdef FEAT_SYN_HL 1636 if (has_syntax) 1637 { 1638 col = (int)(p - buf); 1639 (void)syn_get_id(wp, lnum, (colnr_T)col, 1640 FALSE, &can_spell, FALSE); 1641 if (!can_spell) 1642 attr = HLF_COUNT; 1643 } 1644 else 1645 #endif 1646 can_spell = TRUE; 1647 1648 if (can_spell) 1649 { 1650 found_one = TRUE; 1651 found_pos.lnum = lnum; 1652 found_pos.col = (int)(p - buf); 1653 found_pos.coladd = 0; 1654 if (dir == FORWARD) 1655 { 1656 /* No need to search further. */ 1657 wp->w_cursor = found_pos; 1658 vim_free(buf); 1659 if (attrp != NULL) 1660 *attrp = attr; 1661 return len; 1662 } 1663 else if (curline) 1664 /* Insert mode completion: put cursor after 1665 * the bad word. */ 1666 found_pos.col += len; 1667 found_len = len; 1668 } 1669 } 1670 else 1671 found_one = TRUE; 1672 } 1673 } 1674 1675 /* advance to character after the word */ 1676 p += len; 1677 capcol -= len; 1678 } 1679 1680 if (dir == BACKWARD && found_pos.lnum != 0) 1681 { 1682 /* Use the last match in the line (before the cursor). */ 1683 wp->w_cursor = found_pos; 1684 vim_free(buf); 1685 return found_len; 1686 } 1687 1688 if (curline) 1689 break; /* only check cursor line */ 1690 1691 /* If we are back at the starting line and searched it again there 1692 * is no match, give up. */ 1693 if (lnum == wp->w_cursor.lnum && wrapped) 1694 break; 1695 1696 /* Advance to next line. */ 1697 if (dir == BACKWARD) 1698 { 1699 if (lnum > 1) 1700 --lnum; 1701 else if (!p_ws) 1702 break; /* at first line and 'nowrapscan' */ 1703 else 1704 { 1705 /* Wrap around to the end of the buffer. May search the 1706 * starting line again and accept the last match. 
*/ 1707 lnum = wp->w_buffer->b_ml.ml_line_count; 1708 wrapped = TRUE; 1709 if (!shortmess(SHM_SEARCH)) 1710 give_warning((char_u *)_(top_bot_msg), TRUE); 1711 } 1712 capcol = -1; 1713 } 1714 else 1715 { 1716 if (lnum < wp->w_buffer->b_ml.ml_line_count) 1717 ++lnum; 1718 else if (!p_ws) 1719 break; /* at first line and 'nowrapscan' */ 1720 else 1721 { 1722 /* Wrap around to the start of the buffer. May search the 1723 * starting line again and accept the first match. */ 1724 lnum = 1; 1725 wrapped = TRUE; 1726 if (!shortmess(SHM_SEARCH)) 1727 give_warning((char_u *)_(bot_top_msg), TRUE); 1728 } 1729 1730 /* If we are back at the starting line and there is no match then 1731 * give up. */ 1732 if (lnum == wp->w_cursor.lnum && !found_one) 1733 break; 1734 1735 /* Skip the characters at the start of the next line that were 1736 * included in a match crossing line boundaries. */ 1737 if (attr == HLF_COUNT) 1738 skip = (int)(p - endp); 1739 else 1740 skip = 0; 1741 1742 /* Capcol skips over the inserted space. */ 1743 --capcol; 1744 1745 /* But after empty line check first word in next line */ 1746 if (*skipwhite(line) == NUL) 1747 capcol = 0; 1748 } 1749 1750 line_breakcheck(); 1751 } 1752 1753 vim_free(buf); 1754 return 0; 1755 } 1756 1757 /* 1758 * For spell checking: concatenate the start of the following line "line" into 1759 * "buf", blanking-out special characters. Copy less then "maxlen" bytes. 1760 * Keep the blanks at the start of the next line, this is used in win_line() 1761 * to skip those bytes if the word was OK. 1762 */ 1763 void 1764 spell_cat_line(char_u *buf, char_u *line, int maxlen) 1765 { 1766 char_u *p; 1767 int n; 1768 1769 p = skipwhite(line); 1770 while (vim_strchr((char_u *)"*#/\"\t", *p) != NULL) 1771 p = skipwhite(p + 1); 1772 1773 if (*p != NUL) 1774 { 1775 /* Only worth concatenating if there is something else than spaces to 1776 * concatenate. 
*/ 1777 n = (int)(p - line) + 1; 1778 if (n < maxlen - 1) 1779 { 1780 vim_memset(buf, ' ', n); 1781 vim_strncpy(buf + n, p, maxlen - 1 - n); 1782 } 1783 } 1784 } 1785 1786 /* 1787 * Structure used for the cookie argument of do_in_runtimepath(). 1788 */ 1789 typedef struct spelload_S 1790 { 1791 char_u sl_lang[MAXWLEN + 1]; /* language name */ 1792 slang_T *sl_slang; /* resulting slang_T struct */ 1793 int sl_nobreak; /* NOBREAK language found */ 1794 } spelload_T; 1795 1796 /* 1797 * Load word list(s) for "lang" from Vim spell file(s). 1798 * "lang" must be the language without the region: e.g., "en". 1799 */ 1800 static void 1801 spell_load_lang(char_u *lang) 1802 { 1803 char_u fname_enc[85]; 1804 int r; 1805 spelload_T sl; 1806 int round; 1807 1808 /* Copy the language name to pass it to spell_load_cb() as a cookie. 1809 * It's truncated when an error is detected. */ 1810 STRCPY(sl.sl_lang, lang); 1811 sl.sl_slang = NULL; 1812 sl.sl_nobreak = FALSE; 1813 1814 /* We may retry when no spell file is found for the language, an 1815 * autocommand may load it then. */ 1816 for (round = 1; round <= 2; ++round) 1817 { 1818 /* 1819 * Find the first spell file for "lang" in 'runtimepath' and load it. 1820 */ 1821 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, 1822 #ifdef VMS 1823 "spell/%s_%s.spl", 1824 #else 1825 "spell/%s.%s.spl", 1826 #endif 1827 lang, spell_enc()); 1828 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); 1829 1830 if (r == FAIL && *sl.sl_lang != NUL) 1831 { 1832 /* Try loading the ASCII version. 
*/ 1833 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, 1834 #ifdef VMS 1835 "spell/%s_ascii.spl", 1836 #else 1837 "spell/%s.ascii.spl", 1838 #endif 1839 lang); 1840 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); 1841 1842 if (r == FAIL && *sl.sl_lang != NUL && round == 1 1843 && apply_autocmds(EVENT_SPELLFILEMISSING, lang, 1844 curbuf->b_fname, FALSE, curbuf)) 1845 continue; 1846 break; 1847 } 1848 break; 1849 } 1850 1851 if (r == FAIL) 1852 { 1853 smsg( 1854 #ifdef VMS 1855 _("Warning: Cannot find word list \"%s_%s.spl\" or \"%s_ascii.spl\""), 1856 #else 1857 _("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\""), 1858 #endif 1859 lang, spell_enc(), lang); 1860 } 1861 else if (sl.sl_slang != NULL) 1862 { 1863 /* At least one file was loaded, now load ALL the additions. */ 1864 STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl"); 1865 do_in_runtimepath(fname_enc, DIP_ALL, spell_load_cb, &sl); 1866 } 1867 } 1868 1869 /* 1870 * Return the encoding used for spell checking: Use 'encoding', except that we 1871 * use "latin1" for "latin9". And limit to 60 characters (just in case). 1872 */ 1873 char_u * 1874 spell_enc(void) 1875 { 1876 1877 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0) 1878 return p_enc; 1879 return (char_u *)"latin1"; 1880 } 1881 1882 /* 1883 * Get the name of the .spl file for the internal wordlist into 1884 * "fname[MAXPATHL]". 1885 */ 1886 static void 1887 int_wordlist_spl(char_u *fname) 1888 { 1889 vim_snprintf((char *)fname, MAXPATHL, SPL_FNAME_TMPL, 1890 int_wordlist, spell_enc()); 1891 } 1892 1893 /* 1894 * Allocate a new slang_T for language "lang". "lang" can be NULL. 1895 * Caller must fill "sl_next". 
1896 */ 1897 slang_T * 1898 slang_alloc(char_u *lang) 1899 { 1900 slang_T *lp; 1901 1902 lp = (slang_T *)alloc_clear(sizeof(slang_T)); 1903 if (lp != NULL) 1904 { 1905 if (lang != NULL) 1906 lp->sl_name = vim_strsave(lang); 1907 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); 1908 ga_init2(&lp->sl_repsal, sizeof(fromto_T), 10); 1909 lp->sl_compmax = MAXWLEN; 1910 lp->sl_compsylmax = MAXWLEN; 1911 hash_init(&lp->sl_wordcount); 1912 } 1913 1914 return lp; 1915 } 1916 1917 /* 1918 * Free the contents of an slang_T and the structure itself. 1919 */ 1920 void 1921 slang_free(slang_T *lp) 1922 { 1923 vim_free(lp->sl_name); 1924 vim_free(lp->sl_fname); 1925 slang_clear(lp); 1926 vim_free(lp); 1927 } 1928 1929 /* 1930 * Clear an slang_T so that the file can be reloaded. 1931 */ 1932 void 1933 slang_clear(slang_T *lp) 1934 { 1935 garray_T *gap; 1936 fromto_T *ftp; 1937 salitem_T *smp; 1938 int i; 1939 int round; 1940 1941 VIM_CLEAR(lp->sl_fbyts); 1942 VIM_CLEAR(lp->sl_kbyts); 1943 VIM_CLEAR(lp->sl_pbyts); 1944 1945 VIM_CLEAR(lp->sl_fidxs); 1946 VIM_CLEAR(lp->sl_kidxs); 1947 VIM_CLEAR(lp->sl_pidxs); 1948 1949 for (round = 1; round <= 2; ++round) 1950 { 1951 gap = round == 1 ? &lp->sl_rep : &lp->sl_repsal; 1952 while (gap->ga_len > 0) 1953 { 1954 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; 1955 vim_free(ftp->ft_from); 1956 vim_free(ftp->ft_to); 1957 } 1958 ga_clear(gap); 1959 } 1960 1961 gap = &lp->sl_sal; 1962 if (lp->sl_sofo) 1963 { 1964 /* "ga_len" is set to 1 without adding an item for latin1 */ 1965 if (gap->ga_data != NULL) 1966 /* SOFOFROM and SOFOTO items: free lists of wide characters. */ 1967 for (i = 0; i < gap->ga_len; ++i) 1968 vim_free(((int **)gap->ga_data)[i]); 1969 } 1970 else 1971 /* SAL items: free salitem_T items */ 1972 while (gap->ga_len > 0) 1973 { 1974 smp = &((salitem_T *)gap->ga_data)[--gap->ga_len]; 1975 vim_free(smp->sm_lead); 1976 /* Don't free sm_oneof and sm_rules, they point into sm_lead. 
*/ 1977 vim_free(smp->sm_to); 1978 vim_free(smp->sm_lead_w); 1979 vim_free(smp->sm_oneof_w); 1980 vim_free(smp->sm_to_w); 1981 } 1982 ga_clear(gap); 1983 1984 for (i = 0; i < lp->sl_prefixcnt; ++i) 1985 vim_regfree(lp->sl_prefprog[i]); 1986 lp->sl_prefixcnt = 0; 1987 VIM_CLEAR(lp->sl_prefprog); 1988 1989 VIM_CLEAR(lp->sl_info); 1990 1991 VIM_CLEAR(lp->sl_midword); 1992 1993 vim_regfree(lp->sl_compprog); 1994 lp->sl_compprog = NULL; 1995 VIM_CLEAR(lp->sl_comprules); 1996 VIM_CLEAR(lp->sl_compstartflags); 1997 VIM_CLEAR(lp->sl_compallflags); 1998 1999 VIM_CLEAR(lp->sl_syllable); 2000 ga_clear(&lp->sl_syl_items); 2001 2002 ga_clear_strings(&lp->sl_comppat); 2003 2004 hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF); 2005 hash_init(&lp->sl_wordcount); 2006 2007 hash_clear_all(&lp->sl_map_hash, 0); 2008 2009 /* Clear info from .sug file. */ 2010 slang_clear_sug(lp); 2011 2012 lp->sl_compmax = MAXWLEN; 2013 lp->sl_compminlen = 0; 2014 lp->sl_compsylmax = MAXWLEN; 2015 lp->sl_regions[0] = NUL; 2016 } 2017 2018 /* 2019 * Clear the info from the .sug file in "lp". 2020 */ 2021 void 2022 slang_clear_sug(slang_T *lp) 2023 { 2024 VIM_CLEAR(lp->sl_sbyts); 2025 VIM_CLEAR(lp->sl_sidxs); 2026 close_spellbuf(lp->sl_sugbuf); 2027 lp->sl_sugbuf = NULL; 2028 lp->sl_sugloaded = FALSE; 2029 lp->sl_sugtime = 0; 2030 } 2031 2032 /* 2033 * Load one spell file and store the info into a slang_T. 2034 * Invoked through do_in_runtimepath(). 2035 */ 2036 static void 2037 spell_load_cb(char_u *fname, void *cookie) 2038 { 2039 spelload_T *slp = (spelload_T *)cookie; 2040 slang_T *slang; 2041 2042 slang = spell_load_file(fname, slp->sl_lang, NULL, FALSE); 2043 if (slang != NULL) 2044 { 2045 /* When a previously loaded file has NOBREAK also use it for the 2046 * ".add" files. 
*/ 2047 if (slp->sl_nobreak && slang->sl_add) 2048 slang->sl_nobreak = TRUE; 2049 else if (slang->sl_nobreak) 2050 slp->sl_nobreak = TRUE; 2051 2052 slp->sl_slang = slang; 2053 } 2054 } 2055 2056 2057 /* 2058 * Add a word to the hashtable of common words. 2059 * If it's already there then the counter is increased. 2060 */ 2061 void 2062 count_common_word( 2063 slang_T *lp, 2064 char_u *word, 2065 int len, /* word length, -1 for upto NUL */ 2066 int count) /* 1 to count once, 10 to init */ 2067 { 2068 hash_T hash; 2069 hashitem_T *hi; 2070 wordcount_T *wc; 2071 char_u buf[MAXWLEN]; 2072 char_u *p; 2073 2074 if (len == -1) 2075 p = word; 2076 else 2077 { 2078 vim_strncpy(buf, word, len); 2079 p = buf; 2080 } 2081 2082 hash = hash_hash(p); 2083 hi = hash_lookup(&lp->sl_wordcount, p, hash); 2084 if (HASHITEM_EMPTY(hi)) 2085 { 2086 wc = (wordcount_T *)alloc((unsigned)(sizeof(wordcount_T) + STRLEN(p))); 2087 if (wc == NULL) 2088 return; 2089 STRCPY(wc->wc_word, p); 2090 wc->wc_count = count; 2091 hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash); 2092 } 2093 else 2094 { 2095 wc = HI2WC(hi); 2096 if ((wc->wc_count += count) < (unsigned)count) /* check for overflow */ 2097 wc->wc_count = MAXWORDCOUNT; 2098 } 2099 } 2100 2101 /* 2102 * Adjust the score of common words. 
2103 */ 2104 static int 2105 score_wordcount_adj( 2106 slang_T *slang, 2107 int score, 2108 char_u *word, 2109 int split) /* word was split, less bonus */ 2110 { 2111 hashitem_T *hi; 2112 wordcount_T *wc; 2113 int bonus; 2114 int newscore; 2115 2116 hi = hash_find(&slang->sl_wordcount, word); 2117 if (!HASHITEM_EMPTY(hi)) 2118 { 2119 wc = HI2WC(hi); 2120 if (wc->wc_count < SCORE_THRES2) 2121 bonus = SCORE_COMMON1; 2122 else if (wc->wc_count < SCORE_THRES3) 2123 bonus = SCORE_COMMON2; 2124 else 2125 bonus = SCORE_COMMON3; 2126 if (split) 2127 newscore = score - bonus / 2; 2128 else 2129 newscore = score - bonus; 2130 if (newscore < 0) 2131 return 0; 2132 return newscore; 2133 } 2134 return score; 2135 } 2136 2137 2138 /* 2139 * Return TRUE if byte "n" appears in "str". 2140 * Like strchr() but independent of locale. 2141 */ 2142 int 2143 byte_in_str(char_u *str, int n) 2144 { 2145 char_u *p; 2146 2147 for (p = str; *p != NUL; ++p) 2148 if (*p == n) 2149 return TRUE; 2150 return FALSE; 2151 } 2152 2153 #define SY_MAXLEN 30 2154 typedef struct syl_item_S 2155 { 2156 char_u sy_chars[SY_MAXLEN]; /* the sequence of chars */ 2157 int sy_len; 2158 } syl_item_T; 2159 2160 /* 2161 * Truncate "slang->sl_syllable" at the first slash and put the following items 2162 * in "slang->sl_syl_items". 
2163 */ 2164 int 2165 init_syl_tab(slang_T *slang) 2166 { 2167 char_u *p; 2168 char_u *s; 2169 int l; 2170 syl_item_T *syl; 2171 2172 ga_init2(&slang->sl_syl_items, sizeof(syl_item_T), 4); 2173 p = vim_strchr(slang->sl_syllable, '/'); 2174 while (p != NULL) 2175 { 2176 *p++ = NUL; 2177 if (*p == NUL) /* trailing slash */ 2178 break; 2179 s = p; 2180 p = vim_strchr(p, '/'); 2181 if (p == NULL) 2182 l = (int)STRLEN(s); 2183 else 2184 l = (int)(p - s); 2185 if (l >= SY_MAXLEN) 2186 return SP_FORMERROR; 2187 if (ga_grow(&slang->sl_syl_items, 1) == FAIL) 2188 return SP_OTHERERROR; 2189 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) 2190 + slang->sl_syl_items.ga_len++; 2191 vim_strncpy(syl->sy_chars, s, l); 2192 syl->sy_len = l; 2193 } 2194 return OK; 2195 } 2196 2197 /* 2198 * Count the number of syllables in "word". 2199 * When "word" contains spaces the syllables after the last space are counted. 2200 * Returns zero if syllables are not defines. 2201 */ 2202 static int 2203 count_syllables(slang_T *slang, char_u *word) 2204 { 2205 int cnt = 0; 2206 int skip = FALSE; 2207 char_u *p; 2208 int len; 2209 int i; 2210 syl_item_T *syl; 2211 int c; 2212 2213 if (slang->sl_syllable == NULL) 2214 return 0; 2215 2216 for (p = word; *p != NUL; p += len) 2217 { 2218 /* When running into a space reset counter. */ 2219 if (*p == ' ') 2220 { 2221 len = 1; 2222 cnt = 0; 2223 continue; 2224 } 2225 2226 /* Find longest match of syllable items. */ 2227 len = 0; 2228 for (i = 0; i < slang->sl_syl_items.ga_len; ++i) 2229 { 2230 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) + i; 2231 if (syl->sy_len > len 2232 && STRNCMP(p, syl->sy_chars, syl->sy_len) == 0) 2233 len = syl->sy_len; 2234 } 2235 if (len != 0) /* found a match, count syllable */ 2236 { 2237 ++cnt; 2238 skip = FALSE; 2239 } 2240 else 2241 { 2242 /* No recognized syllable item, at least a syllable char then? 
*/ 2243 c = mb_ptr2char(p); 2244 len = (*mb_ptr2len)(p); 2245 if (vim_strchr(slang->sl_syllable, c) == NULL) 2246 skip = FALSE; /* No, search for next syllable */ 2247 else if (!skip) 2248 { 2249 ++cnt; /* Yes, count it */ 2250 skip = TRUE; /* don't count following syllable chars */ 2251 } 2252 } 2253 } 2254 return cnt; 2255 } 2256 2257 /* 2258 * Parse 'spelllang' and set w_s->b_langp accordingly. 2259 * Returns NULL if it's OK, an error message otherwise. 2260 */ 2261 char * 2262 did_set_spelllang(win_T *wp) 2263 { 2264 garray_T ga; 2265 char_u *splp; 2266 char_u *region; 2267 char_u region_cp[3]; 2268 int filename; 2269 int region_mask; 2270 slang_T *slang; 2271 int c; 2272 char_u lang[MAXWLEN + 1]; 2273 char_u spf_name[MAXPATHL]; 2274 int len; 2275 char_u *p; 2276 int round; 2277 char_u *spf; 2278 char_u *use_region = NULL; 2279 int dont_use_region = FALSE; 2280 int nobreak = FALSE; 2281 int i, j; 2282 langp_T *lp, *lp2; 2283 static int recursive = FALSE; 2284 char *ret_msg = NULL; 2285 char_u *spl_copy; 2286 bufref_T bufref; 2287 2288 set_bufref(&bufref, wp->w_buffer); 2289 2290 /* We don't want to do this recursively. May happen when a language is 2291 * not available and the SpellFileMissing autocommand opens a new buffer 2292 * in which 'spell' is set. */ 2293 if (recursive) 2294 return NULL; 2295 recursive = TRUE; 2296 2297 ga_init2(&ga, sizeof(langp_T), 2); 2298 clear_midword(wp); 2299 2300 /* Make a copy of 'spelllang', the SpellFileMissing autocommands may change 2301 * it under our fingers. */ 2302 spl_copy = vim_strsave(wp->w_s->b_p_spl); 2303 if (spl_copy == NULL) 2304 goto theend; 2305 2306 wp->w_s->b_cjk = 0; 2307 2308 /* Loop over comma separated language names. */ 2309 for (splp = spl_copy; *splp != NUL; ) 2310 { 2311 /* Get one language name. 
*/ 2312 copy_option_part(&splp, lang, MAXWLEN, ","); 2313 region = NULL; 2314 len = (int)STRLEN(lang); 2315 2316 if (STRCMP(lang, "cjk") == 0) 2317 { 2318 wp->w_s->b_cjk = 1; 2319 continue; 2320 } 2321 2322 /* If the name ends in ".spl" use it as the name of the spell file. 2323 * If there is a region name let "region" point to it and remove it 2324 * from the name. */ 2325 if (len > 4 && fnamecmp(lang + len - 4, ".spl") == 0) 2326 { 2327 filename = TRUE; 2328 2329 /* Locate a region and remove it from the file name. */ 2330 p = vim_strchr(gettail(lang), '_'); 2331 if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2]) 2332 && !ASCII_ISALPHA(p[3])) 2333 { 2334 vim_strncpy(region_cp, p + 1, 2); 2335 mch_memmove(p, p + 3, len - (p - lang) - 2); 2336 region = region_cp; 2337 } 2338 else 2339 dont_use_region = TRUE; 2340 2341 /* Check if we loaded this language before. */ 2342 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2343 if (fullpathcmp(lang, slang->sl_fname, FALSE) == FPC_SAME) 2344 break; 2345 } 2346 else 2347 { 2348 filename = FALSE; 2349 if (len > 3 && lang[len - 3] == '_') 2350 { 2351 region = lang + len - 2; 2352 len -= 3; 2353 lang[len] = NUL; 2354 } 2355 else 2356 dont_use_region = TRUE; 2357 2358 /* Check if we loaded this language before. */ 2359 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2360 if (STRICMP(lang, slang->sl_name) == 0) 2361 break; 2362 } 2363 2364 if (region != NULL) 2365 { 2366 /* If the region differs from what was used before then don't 2367 * use it for 'spellfile'. */ 2368 if (use_region != NULL && STRCMP(region, use_region) != 0) 2369 dont_use_region = TRUE; 2370 use_region = region; 2371 } 2372 2373 /* If not found try loading the language now. 
*/ 2374 if (slang == NULL) 2375 { 2376 if (filename) 2377 (void)spell_load_file(lang, lang, NULL, FALSE); 2378 else 2379 { 2380 spell_load_lang(lang); 2381 /* SpellFileMissing autocommands may do anything, including 2382 * destroying the buffer we are using... */ 2383 if (!bufref_valid(&bufref)) 2384 { 2385 ret_msg = N_("E797: SpellFileMissing autocommand deleted buffer"); 2386 goto theend; 2387 } 2388 } 2389 } 2390 2391 /* 2392 * Loop over the languages, there can be several files for "lang". 2393 */ 2394 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2395 if (filename ? fullpathcmp(lang, slang->sl_fname, FALSE) == FPC_SAME 2396 : STRICMP(lang, slang->sl_name) == 0) 2397 { 2398 region_mask = REGION_ALL; 2399 if (!filename && region != NULL) 2400 { 2401 /* find region in sl_regions */ 2402 c = find_region(slang->sl_regions, region); 2403 if (c == REGION_ALL) 2404 { 2405 if (slang->sl_add) 2406 { 2407 if (*slang->sl_regions != NUL) 2408 /* This addition file is for other regions. */ 2409 region_mask = 0; 2410 } 2411 else 2412 /* This is probably an error. Give a warning and 2413 * accept the words anyway. */ 2414 smsg(_("Warning: region %s not supported"), 2415 region); 2416 } 2417 else 2418 region_mask = 1 << c; 2419 } 2420 2421 if (region_mask != 0) 2422 { 2423 if (ga_grow(&ga, 1) == FAIL) 2424 { 2425 ga_clear(&ga); 2426 ret_msg = e_outofmem; 2427 goto theend; 2428 } 2429 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang; 2430 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; 2431 ++ga.ga_len; 2432 use_midword(slang, wp); 2433 if (slang->sl_nobreak) 2434 nobreak = TRUE; 2435 } 2436 } 2437 } 2438 2439 /* round 0: load int_wordlist, if possible. 2440 * round 1: load first name in 'spellfile'. 2441 * round 2: load second name in 'spellfile. 2442 * etc. */ 2443 spf = curwin->w_s->b_p_spf; 2444 for (round = 0; round == 0 || *spf != NUL; ++round) 2445 { 2446 if (round == 0) 2447 { 2448 /* Internal wordlist, if there is one. 
*/ 2449 if (int_wordlist == NULL) 2450 continue; 2451 int_wordlist_spl(spf_name); 2452 } 2453 else 2454 { 2455 /* One entry in 'spellfile'. */ 2456 copy_option_part(&spf, spf_name, MAXPATHL - 5, ","); 2457 STRCAT(spf_name, ".spl"); 2458 2459 /* If it was already found above then skip it. */ 2460 for (c = 0; c < ga.ga_len; ++c) 2461 { 2462 p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname; 2463 if (p != NULL && fullpathcmp(spf_name, p, FALSE) == FPC_SAME) 2464 break; 2465 } 2466 if (c < ga.ga_len) 2467 continue; 2468 } 2469 2470 /* Check if it was loaded already. */ 2471 for (slang = first_lang; slang != NULL; slang = slang->sl_next) 2472 if (fullpathcmp(spf_name, slang->sl_fname, FALSE) == FPC_SAME) 2473 break; 2474 if (slang == NULL) 2475 { 2476 /* Not loaded, try loading it now. The language name includes the 2477 * region name, the region is ignored otherwise. for int_wordlist 2478 * use an arbitrary name. */ 2479 if (round == 0) 2480 STRCPY(lang, "internal wordlist"); 2481 else 2482 { 2483 vim_strncpy(lang, gettail(spf_name), MAXWLEN); 2484 p = vim_strchr(lang, '.'); 2485 if (p != NULL) 2486 *p = NUL; /* truncate at ".encoding.add" */ 2487 } 2488 slang = spell_load_file(spf_name, lang, NULL, TRUE); 2489 2490 /* If one of the languages has NOBREAK we assume the addition 2491 * files also have this. */ 2492 if (slang != NULL && nobreak) 2493 slang->sl_nobreak = TRUE; 2494 } 2495 if (slang != NULL && ga_grow(&ga, 1) == OK) 2496 { 2497 region_mask = REGION_ALL; 2498 if (use_region != NULL && !dont_use_region) 2499 { 2500 /* find region in sl_regions */ 2501 c = find_region(slang->sl_regions, use_region); 2502 if (c != REGION_ALL) 2503 region_mask = 1 << c; 2504 else if (*slang->sl_regions != NUL) 2505 /* This spell file is for other regions. 
*/ 2506 region_mask = 0; 2507 } 2508 2509 if (region_mask != 0) 2510 { 2511 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang; 2512 LANGP_ENTRY(ga, ga.ga_len)->lp_sallang = NULL; 2513 LANGP_ENTRY(ga, ga.ga_len)->lp_replang = NULL; 2514 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; 2515 ++ga.ga_len; 2516 use_midword(slang, wp); 2517 } 2518 } 2519 } 2520 2521 /* Everything is fine, store the new b_langp value. */ 2522 ga_clear(&wp->w_s->b_langp); 2523 wp->w_s->b_langp = ga; 2524 2525 /* For each language figure out what language to use for sound folding and 2526 * REP items. If the language doesn't support it itself use another one 2527 * with the same name. E.g. for "en-math" use "en". */ 2528 for (i = 0; i < ga.ga_len; ++i) 2529 { 2530 lp = LANGP_ENTRY(ga, i); 2531 2532 /* sound folding */ 2533 if (lp->lp_slang->sl_sal.ga_len > 0) 2534 /* language does sound folding itself */ 2535 lp->lp_sallang = lp->lp_slang; 2536 else 2537 /* find first similar language that does sound folding */ 2538 for (j = 0; j < ga.ga_len; ++j) 2539 { 2540 lp2 = LANGP_ENTRY(ga, j); 2541 if (lp2->lp_slang->sl_sal.ga_len > 0 2542 && STRNCMP(lp->lp_slang->sl_name, 2543 lp2->lp_slang->sl_name, 2) == 0) 2544 { 2545 lp->lp_sallang = lp2->lp_slang; 2546 break; 2547 } 2548 } 2549 2550 /* REP items */ 2551 if (lp->lp_slang->sl_rep.ga_len > 0) 2552 /* language has REP items itself */ 2553 lp->lp_replang = lp->lp_slang; 2554 else 2555 /* find first similar language that has REP items */ 2556 for (j = 0; j < ga.ga_len; ++j) 2557 { 2558 lp2 = LANGP_ENTRY(ga, j); 2559 if (lp2->lp_slang->sl_rep.ga_len > 0 2560 && STRNCMP(lp->lp_slang->sl_name, 2561 lp2->lp_slang->sl_name, 2) == 0) 2562 { 2563 lp->lp_replang = lp2->lp_slang; 2564 break; 2565 } 2566 } 2567 } 2568 2569 theend: 2570 vim_free(spl_copy); 2571 recursive = FALSE; 2572 redraw_win_later(wp, NOT_VALID); 2573 return ret_msg; 2574 } 2575 2576 /* 2577 * Clear the midword characters for buffer "buf". 
2578 */ 2579 static void 2580 clear_midword(win_T *wp) 2581 { 2582 vim_memset(wp->w_s->b_spell_ismw, 0, 256); 2583 VIM_CLEAR(wp->w_s->b_spell_ismw_mb); 2584 } 2585 2586 /* 2587 * Use the "sl_midword" field of language "lp" for buffer "buf". 2588 * They add up to any currently used midword characters. 2589 */ 2590 static void 2591 use_midword(slang_T *lp, win_T *wp) 2592 { 2593 char_u *p; 2594 2595 if (lp->sl_midword == NULL) /* there aren't any */ 2596 return; 2597 2598 for (p = lp->sl_midword; *p != NUL; ) 2599 if (has_mbyte) 2600 { 2601 int c, l, n; 2602 char_u *bp; 2603 2604 c = mb_ptr2char(p); 2605 l = (*mb_ptr2len)(p); 2606 if (c < 256 && l <= 2) 2607 wp->w_s->b_spell_ismw[c] = TRUE; 2608 else if (wp->w_s->b_spell_ismw_mb == NULL) 2609 /* First multi-byte char in "b_spell_ismw_mb". */ 2610 wp->w_s->b_spell_ismw_mb = vim_strnsave(p, l); 2611 else 2612 { 2613 /* Append multi-byte chars to "b_spell_ismw_mb". */ 2614 n = (int)STRLEN(wp->w_s->b_spell_ismw_mb); 2615 bp = vim_strnsave(wp->w_s->b_spell_ismw_mb, n + l); 2616 if (bp != NULL) 2617 { 2618 vim_free(wp->w_s->b_spell_ismw_mb); 2619 wp->w_s->b_spell_ismw_mb = bp; 2620 vim_strncpy(bp + n, p, l); 2621 } 2622 } 2623 p += l; 2624 } 2625 else 2626 wp->w_s->b_spell_ismw[*p++] = TRUE; 2627 } 2628 2629 /* 2630 * Find the region "region[2]" in "rp" (points to "sl_regions"). 2631 * Each region is simply stored as the two characters of its name. 2632 * Returns the index if found (first is 0), REGION_ALL if not found. 2633 */ 2634 static int 2635 find_region(char_u *rp, char_u *region) 2636 { 2637 int i; 2638 2639 for (i = 0; ; i += 2) 2640 { 2641 if (rp[i] == NUL) 2642 return REGION_ALL; 2643 if (rp[i] == region[0] && rp[i + 1] == region[1]) 2644 break; 2645 } 2646 return i / 2; 2647 } 2648 2649 /* 2650 * Return case type of word: 2651 * w word 0 2652 * Word WF_ONECAP 2653 * W WORD WF_ALLCAP 2654 * WoRd wOrd WF_KEEPCAP 2655 */ 2656 int 2657 captype( 2658 char_u *word, 2659 char_u *end) /* When NULL use up to NUL byte. 
*/ 2660 { 2661 char_u *p; 2662 int c; 2663 int firstcap; 2664 int allcap; 2665 int past_second = FALSE; /* past second word char */ 2666 2667 /* find first letter */ 2668 for (p = word; !spell_iswordp_nmw(p, curwin); MB_PTR_ADV(p)) 2669 if (end == NULL ? *p == NUL : p >= end) 2670 return 0; /* only non-word characters, illegal word */ 2671 if (has_mbyte) 2672 c = mb_ptr2char_adv(&p); 2673 else 2674 c = *p++; 2675 firstcap = allcap = SPELL_ISUPPER(c); 2676 2677 /* 2678 * Need to check all letters to find a word with mixed upper/lower. 2679 * But a word with an upper char only at start is a ONECAP. 2680 */ 2681 for ( ; end == NULL ? *p != NUL : p < end; MB_PTR_ADV(p)) 2682 if (spell_iswordp_nmw(p, curwin)) 2683 { 2684 c = PTR2CHAR(p); 2685 if (!SPELL_ISUPPER(c)) 2686 { 2687 /* UUl -> KEEPCAP */ 2688 if (past_second && allcap) 2689 return WF_KEEPCAP; 2690 allcap = FALSE; 2691 } 2692 else if (!allcap) 2693 /* UlU -> KEEPCAP */ 2694 return WF_KEEPCAP; 2695 past_second = TRUE; 2696 } 2697 2698 if (allcap) 2699 return WF_ALLCAP; 2700 if (firstcap) 2701 return WF_ONECAP; 2702 return 0; 2703 } 2704 2705 /* 2706 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a 2707 * capital. So that make_case_word() can turn WOrd into Word. 2708 * Add ALLCAP for "WOrD". 2709 */ 2710 static int 2711 badword_captype(char_u *word, char_u *end) 2712 { 2713 int flags = captype(word, end); 2714 int c; 2715 int l, u; 2716 int first; 2717 char_u *p; 2718 2719 if (flags & WF_KEEPCAP) 2720 { 2721 /* Count the number of UPPER and lower case letters. */ 2722 l = u = 0; 2723 first = FALSE; 2724 for (p = word; p < end; MB_PTR_ADV(p)) 2725 { 2726 c = PTR2CHAR(p); 2727 if (SPELL_ISUPPER(c)) 2728 { 2729 ++u; 2730 if (p == word) 2731 first = TRUE; 2732 } 2733 else 2734 ++l; 2735 } 2736 2737 /* If there are more UPPER than lower case letters suggest an 2738 * ALLCAP word. Otherwise, if the first letter is UPPER then 2739 * suggest ONECAP. 
Exception: "ALl" most likely should be "All", 2740 * require three upper case letters. */ 2741 if (u > l && u > 2) 2742 flags |= WF_ALLCAP; 2743 else if (first) 2744 flags |= WF_ONECAP; 2745 2746 if (u >= 2 && l >= 2) /* maCARONI maCAroni */ 2747 flags |= WF_MIXCAP; 2748 } 2749 return flags; 2750 } 2751 2752 /* 2753 * Delete the internal wordlist and its .spl file. 2754 */ 2755 void 2756 spell_delete_wordlist(void) 2757 { 2758 char_u fname[MAXPATHL]; 2759 2760 if (int_wordlist != NULL) 2761 { 2762 mch_remove(int_wordlist); 2763 int_wordlist_spl(fname); 2764 mch_remove(fname); 2765 VIM_CLEAR(int_wordlist); 2766 } 2767 } 2768 2769 /* 2770 * Free all languages. 2771 */ 2772 void 2773 spell_free_all(void) 2774 { 2775 slang_T *slang; 2776 buf_T *buf; 2777 2778 /* Go through all buffers and handle 'spelllang'. <VN> */ 2779 FOR_ALL_BUFFERS(buf) 2780 ga_clear(&buf->b_s.b_langp); 2781 2782 while (first_lang != NULL) 2783 { 2784 slang = first_lang; 2785 first_lang = slang->sl_next; 2786 slang_free(slang); 2787 } 2788 2789 spell_delete_wordlist(); 2790 2791 VIM_CLEAR(repl_to); 2792 VIM_CLEAR(repl_from); 2793 } 2794 2795 /* 2796 * Clear all spelling tables and reload them. 2797 * Used after 'encoding' is set and when ":mkspell" was used. 2798 */ 2799 void 2800 spell_reload(void) 2801 { 2802 win_T *wp; 2803 2804 /* Initialize the table for spell_iswordp(). */ 2805 init_spell_chartab(); 2806 2807 /* Unload all allocated memory. */ 2808 spell_free_all(); 2809 2810 /* Go through all buffers and handle 'spelllang'. */ 2811 FOR_ALL_WINDOWS(wp) 2812 { 2813 /* Only load the wordlists when 'spelllang' is set and there is a 2814 * window for this buffer in which 'spell' is set. */ 2815 if (*wp->w_s->b_p_spl != NUL) 2816 { 2817 if (wp->w_p_spell) 2818 { 2819 (void)did_set_spelllang(wp); 2820 break; 2821 } 2822 } 2823 } 2824 } 2825 2826 /* 2827 * Opposite of offset2bytes(). 2828 * "pp" points to the bytes and is advanced over it. 2829 * Returns the offset. 
2830 */ 2831 static int 2832 bytes2offset(char_u **pp) 2833 { 2834 char_u *p = *pp; 2835 int nr; 2836 int c; 2837 2838 c = *p++; 2839 if ((c & 0x80) == 0x00) /* 1 byte */ 2840 { 2841 nr = c - 1; 2842 } 2843 else if ((c & 0xc0) == 0x80) /* 2 bytes */ 2844 { 2845 nr = (c & 0x3f) - 1; 2846 nr = nr * 255 + (*p++ - 1); 2847 } 2848 else if ((c & 0xe0) == 0xc0) /* 3 bytes */ 2849 { 2850 nr = (c & 0x1f) - 1; 2851 nr = nr * 255 + (*p++ - 1); 2852 nr = nr * 255 + (*p++ - 1); 2853 } 2854 else /* 4 bytes */ 2855 { 2856 nr = (c & 0x0f) - 1; 2857 nr = nr * 255 + (*p++ - 1); 2858 nr = nr * 255 + (*p++ - 1); 2859 nr = nr * 255 + (*p++ - 1); 2860 } 2861 2862 *pp = p; 2863 return nr; 2864 } 2865 2866 2867 /* 2868 * Open a spell buffer. This is a nameless buffer that is not in the buffer 2869 * list and only contains text lines. Can use a swapfile to reduce memory 2870 * use. 2871 * Most other fields are invalid! Esp. watch out for string options being 2872 * NULL and there is no undo info. 2873 * Returns NULL when out of memory. 2874 */ 2875 buf_T * 2876 open_spellbuf(void) 2877 { 2878 buf_T *buf; 2879 2880 buf = (buf_T *)alloc_clear(sizeof(buf_T)); 2881 if (buf != NULL) 2882 { 2883 buf->b_spell = TRUE; 2884 buf->b_p_swf = TRUE; /* may create a swap file */ 2885 #ifdef FEAT_CRYPT 2886 buf->b_p_key = empty_option; 2887 #endif 2888 ml_open(buf); 2889 ml_open_file(buf); /* create swap file now */ 2890 } 2891 return buf; 2892 } 2893 2894 /* 2895 * Close the buffer used for spell info. 2896 */ 2897 void 2898 close_spellbuf(buf_T *buf) 2899 { 2900 if (buf != NULL) 2901 { 2902 ml_close(buf, TRUE); 2903 vim_free(buf); 2904 } 2905 } 2906 2907 /* 2908 * Init the chartab used for spelling for ASCII. 2909 * EBCDIC is not supported! 2910 */ 2911 void 2912 clear_spell_chartab(spelltab_T *sp) 2913 { 2914 int i; 2915 2916 /* Init everything to FALSE. 
*/ 2917 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw)); 2918 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu)); 2919 for (i = 0; i < 256; ++i) 2920 { 2921 sp->st_fold[i] = i; 2922 sp->st_upper[i] = i; 2923 } 2924 2925 /* We include digits. A word shouldn't start with a digit, but handling 2926 * that is done separately. */ 2927 for (i = '0'; i <= '9'; ++i) 2928 sp->st_isw[i] = TRUE; 2929 for (i = 'A'; i <= 'Z'; ++i) 2930 { 2931 sp->st_isw[i] = TRUE; 2932 sp->st_isu[i] = TRUE; 2933 sp->st_fold[i] = i + 0x20; 2934 } 2935 for (i = 'a'; i <= 'z'; ++i) 2936 { 2937 sp->st_isw[i] = TRUE; 2938 sp->st_upper[i] = i - 0x20; 2939 } 2940 } 2941 2942 /* 2943 * Init the chartab used for spelling. Only depends on 'encoding'. 2944 * Called once while starting up and when 'encoding' changes. 2945 * The default is to use isalpha(), but the spell file should define the word 2946 * characters to make it possible that 'encoding' differs from the current 2947 * locale. For utf-8 we don't use isalpha() but our own functions. 2948 */ 2949 void 2950 init_spell_chartab(void) 2951 { 2952 int i; 2953 2954 did_set_spelltab = FALSE; 2955 clear_spell_chartab(&spelltab); 2956 if (enc_dbcs) 2957 { 2958 /* DBCS: assume double-wide characters are word characters. */ 2959 for (i = 128; i <= 255; ++i) 2960 if (MB_BYTE2LEN(i) == 2) 2961 spelltab.st_isw[i] = TRUE; 2962 } 2963 else if (enc_utf8) 2964 { 2965 for (i = 128; i < 256; ++i) 2966 { 2967 int f = utf_fold(i); 2968 int u = utf_toupper(i); 2969 2970 spelltab.st_isu[i] = utf_isupper(i); 2971 spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i); 2972 /* The folded/upper-cased value is different between latin1 and 2973 * utf8 for 0xb5, causing E763 for no good reason. Use the latin1 2974 * value for utf-8 to avoid this. */ 2975 spelltab.st_fold[i] = (f < 256) ? f : i; 2976 spelltab.st_upper[i] = (u < 256) ? u : i; 2977 } 2978 } 2979 else 2980 { 2981 /* Rough guess: use locale-dependent library functions. 
*/ 2982 for (i = 128; i < 256; ++i) 2983 { 2984 if (MB_ISUPPER(i)) 2985 { 2986 spelltab.st_isw[i] = TRUE; 2987 spelltab.st_isu[i] = TRUE; 2988 spelltab.st_fold[i] = MB_TOLOWER(i); 2989 } 2990 else if (MB_ISLOWER(i)) 2991 { 2992 spelltab.st_isw[i] = TRUE; 2993 spelltab.st_upper[i] = MB_TOUPPER(i); 2994 } 2995 } 2996 } 2997 } 2998 2999 3000 /* 3001 * Return TRUE if "p" points to a word character. 3002 * As a special case we see "midword" characters as word character when it is 3003 * followed by a word character. This finds they'there but not 'they there'. 3004 * Thus this only works properly when past the first character of the word. 3005 */ 3006 static int 3007 spell_iswordp( 3008 char_u *p, 3009 win_T *wp) /* buffer used */ 3010 { 3011 char_u *s; 3012 int l; 3013 int c; 3014 3015 if (has_mbyte) 3016 { 3017 l = MB_PTR2LEN(p); 3018 s = p; 3019 if (l == 1) 3020 { 3021 /* be quick for ASCII */ 3022 if (wp->w_s->b_spell_ismw[*p]) 3023 s = p + 1; /* skip a mid-word character */ 3024 } 3025 else 3026 { 3027 c = mb_ptr2char(p); 3028 if (c < 256 ? wp->w_s->b_spell_ismw[c] 3029 : (wp->w_s->b_spell_ismw_mb != NULL 3030 && vim_strchr(wp->w_s->b_spell_ismw_mb, c) != NULL)) 3031 s = p + l; 3032 } 3033 3034 c = mb_ptr2char(s); 3035 if (c > 255) 3036 return spell_mb_isword_class(mb_get_class(s), wp); 3037 return spelltab.st_isw[c]; 3038 } 3039 3040 return spelltab.st_isw[wp->w_s->b_spell_ismw[*p] ? p[1] : p[0]]; 3041 } 3042 3043 /* 3044 * Return TRUE if "p" points to a word character. 3045 * Unlike spell_iswordp() this doesn't check for "midword" characters. 3046 */ 3047 int 3048 spell_iswordp_nmw(char_u *p, win_T *wp) 3049 { 3050 int c; 3051 3052 if (has_mbyte) 3053 { 3054 c = mb_ptr2char(p); 3055 if (c > 255) 3056 return spell_mb_isword_class(mb_get_class(p), wp); 3057 return spelltab.st_isw[c]; 3058 } 3059 return spelltab.st_isw[*p]; 3060 } 3061 3062 /* 3063 * Return TRUE if word class indicates a word character. 3064 * Only for characters above 255. 
3065 * Unicode subscript and superscript are not considered word characters. 3066 * See also dbcs_class() and utf_class() in mbyte.c. 3067 */ 3068 static int 3069 spell_mb_isword_class(int cl, win_T *wp) 3070 { 3071 if (wp->w_s->b_cjk) 3072 /* East Asian characters are not considered word characters. */ 3073 return cl == 2 || cl == 0x2800; 3074 return cl >= 2 && cl != 0x2070 && cl != 0x2080; 3075 } 3076 3077 /* 3078 * Return TRUE if "p" points to a word character. 3079 * Wide version of spell_iswordp(). 3080 */ 3081 static int 3082 spell_iswordp_w(int *p, win_T *wp) 3083 { 3084 int *s; 3085 3086 if (*p < 256 ? wp->w_s->b_spell_ismw[*p] 3087 : (wp->w_s->b_spell_ismw_mb != NULL 3088 && vim_strchr(wp->w_s->b_spell_ismw_mb, *p) != NULL)) 3089 s = p + 1; 3090 else 3091 s = p; 3092 3093 if (*s > 255) 3094 { 3095 if (enc_utf8) 3096 return spell_mb_isword_class(utf_class(*s), wp); 3097 if (enc_dbcs) 3098 return spell_mb_isword_class( 3099 dbcs_class((unsigned)*s >> 8, *s & 0xff), wp); 3100 return 0; 3101 } 3102 return spelltab.st_isw[*s]; 3103 } 3104 3105 /* 3106 * Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated. 3107 * Uses the character definitions from the .spl file. 3108 * When using a multi-byte 'encoding' the length may change! 3109 * Returns FAIL when something wrong. 3110 */ 3111 int 3112 spell_casefold( 3113 char_u *str, 3114 int len, 3115 char_u *buf, 3116 int buflen) 3117 { 3118 int i; 3119 3120 if (len >= buflen) 3121 { 3122 buf[0] = NUL; 3123 return FAIL; /* result will not fit */ 3124 } 3125 3126 if (has_mbyte) 3127 { 3128 int outi = 0; 3129 char_u *p; 3130 int c; 3131 3132 /* Fold one character at a time. */ 3133 for (p = str; p < str + len; ) 3134 { 3135 if (outi + MB_MAXBYTES > buflen) 3136 { 3137 buf[outi] = NUL; 3138 return FAIL; 3139 } 3140 c = mb_cptr2char_adv(&p); 3141 outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi); 3142 } 3143 buf[outi] = NUL; 3144 } 3145 else 3146 { 3147 /* Be quick for non-multibyte encodings. 
*/ 3148 for (i = 0; i < len; ++i) 3149 buf[i] = spelltab.st_fold[str[i]]; 3150 buf[i] = NUL; 3151 } 3152 3153 return OK; 3154 } 3155 3156 /* values for sps_flags */ 3157 #define SPS_BEST 1 3158 #define SPS_FAST 2 3159 #define SPS_DOUBLE 4 3160 3161 static int sps_flags = SPS_BEST; /* flags from 'spellsuggest' */ 3162 static int sps_limit = 9999; /* max nr of suggestions given */ 3163 3164 /* 3165 * Check the 'spellsuggest' option. Return FAIL if it's wrong. 3166 * Sets "sps_flags" and "sps_limit". 3167 */ 3168 int 3169 spell_check_sps(void) 3170 { 3171 char_u *p; 3172 char_u *s; 3173 char_u buf[MAXPATHL]; 3174 int f; 3175 3176 sps_flags = 0; 3177 sps_limit = 9999; 3178 3179 for (p = p_sps; *p != NUL; ) 3180 { 3181 copy_option_part(&p, buf, MAXPATHL, ","); 3182 3183 f = 0; 3184 if (VIM_ISDIGIT(*buf)) 3185 { 3186 s = buf; 3187 sps_limit = getdigits(&s); 3188 if (*s != NUL && !VIM_ISDIGIT(*s)) 3189 f = -1; 3190 } 3191 else if (STRCMP(buf, "best") == 0) 3192 f = SPS_BEST; 3193 else if (STRCMP(buf, "fast") == 0) 3194 f = SPS_FAST; 3195 else if (STRCMP(buf, "double") == 0) 3196 f = SPS_DOUBLE; 3197 else if (STRNCMP(buf, "expr:", 5) != 0 3198 && STRNCMP(buf, "file:", 5) != 0) 3199 f = -1; 3200 3201 if (f == -1 || (sps_flags != 0 && f != 0)) 3202 { 3203 sps_flags = SPS_BEST; 3204 sps_limit = 9999; 3205 return FAIL; 3206 } 3207 if (f != 0) 3208 sps_flags = f; 3209 } 3210 3211 if (sps_flags == 0) 3212 sps_flags = SPS_BEST; 3213 3214 return OK; 3215 } 3216 3217 /* 3218 * "z=": Find badly spelled word under or after the cursor. 3219 * Give suggestions for the properly spelled word. 3220 * In Visual mode use the highlighted word as the bad word. 3221 * When "count" is non-zero use that suggestion. 
3222 */ 3223 void 3224 spell_suggest(int count) 3225 { 3226 char_u *line; 3227 pos_T prev_cursor = curwin->w_cursor; 3228 char_u wcopy[MAXWLEN + 2]; 3229 char_u *p; 3230 int i; 3231 int c; 3232 suginfo_T sug; 3233 suggest_T *stp; 3234 int mouse_used; 3235 int need_cap; 3236 int limit; 3237 int selected = count; 3238 int badlen = 0; 3239 int msg_scroll_save = msg_scroll; 3240 3241 if (no_spell_checking(curwin)) 3242 return; 3243 3244 if (VIsual_active) 3245 { 3246 /* Use the Visually selected text as the bad word. But reject 3247 * a multi-line selection. */ 3248 if (curwin->w_cursor.lnum != VIsual.lnum) 3249 { 3250 vim_beep(BO_SPELL); 3251 return; 3252 } 3253 badlen = (int)curwin->w_cursor.col - (int)VIsual.col; 3254 if (badlen < 0) 3255 badlen = -badlen; 3256 else 3257 curwin->w_cursor.col = VIsual.col; 3258 ++badlen; 3259 end_visual_mode(); 3260 } 3261 /* Find the start of the badly spelled word. */ 3262 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0 3263 || curwin->w_cursor.col > prev_cursor.col) 3264 { 3265 /* No bad word or it starts after the cursor: use the word under the 3266 * cursor. */ 3267 curwin->w_cursor = prev_cursor; 3268 line = ml_get_curline(); 3269 p = line + curwin->w_cursor.col; 3270 /* Backup to before start of word. */ 3271 while (p > line && spell_iswordp_nmw(p, curwin)) 3272 MB_PTR_BACK(line, p); 3273 /* Forward to start of word. */ 3274 while (*p != NUL && !spell_iswordp_nmw(p, curwin)) 3275 MB_PTR_ADV(p); 3276 3277 if (!spell_iswordp_nmw(p, curwin)) /* No word found. */ 3278 { 3279 beep_flush(); 3280 return; 3281 } 3282 curwin->w_cursor.col = (colnr_T)(p - line); 3283 } 3284 3285 /* Get the word and its length. */ 3286 3287 /* Figure out if the word should be capitalised. */ 3288 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col); 3289 3290 /* Make a copy of current line since autocommands may free the line. 
*/ 3291 line = vim_strsave(ml_get_curline()); 3292 if (line == NULL) 3293 goto skip; 3294 3295 /* Get the list of suggestions. Limit to 'lines' - 2 or the number in 3296 * 'spellsuggest', whatever is smaller. */ 3297 if (sps_limit > (int)Rows - 2) 3298 limit = (int)Rows - 2; 3299 else 3300 limit = sps_limit; 3301 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit, 3302 TRUE, need_cap, TRUE); 3303 3304 if (sug.su_ga.ga_len == 0) 3305 msg(_("Sorry, no suggestions")); 3306 else if (count > 0) 3307 { 3308 if (count > sug.su_ga.ga_len) 3309 smsg(_("Sorry, only %ld suggestions"), 3310 (long)sug.su_ga.ga_len); 3311 } 3312 else 3313 { 3314 VIM_CLEAR(repl_from); 3315 VIM_CLEAR(repl_to); 3316 3317 #ifdef FEAT_RIGHTLEFT 3318 /* When 'rightleft' is set the list is drawn right-left. */ 3319 cmdmsg_rl = curwin->w_p_rl; 3320 if (cmdmsg_rl) 3321 msg_col = Columns - 1; 3322 #endif 3323 3324 /* List the suggestions. */ 3325 msg_start(); 3326 msg_row = Rows - 1; /* for when 'cmdheight' > 1 */ 3327 lines_left = Rows; /* avoid more prompt */ 3328 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"), 3329 sug.su_badlen, sug.su_badptr); 3330 #ifdef FEAT_RIGHTLEFT 3331 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0) 3332 { 3333 /* And now the rabbit from the high hat: Avoid showing the 3334 * untranslated message rightleft. */ 3335 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC", 3336 sug.su_badlen, sug.su_badptr); 3337 } 3338 #endif 3339 msg_puts((char *)IObuff); 3340 msg_clr_eos(); 3341 msg_putchar('\n'); 3342 3343 msg_scroll = TRUE; 3344 for (i = 0; i < sug.su_ga.ga_len; ++i) 3345 { 3346 stp = &SUG(sug.su_ga, i); 3347 3348 /* The suggested word may replace only part of the bad word, add 3349 * the not replaced part. 
*/ 3350 vim_strncpy(wcopy, stp->st_word, MAXWLEN); 3351 if (sug.su_badlen > stp->st_orglen) 3352 vim_strncpy(wcopy + stp->st_wordlen, 3353 sug.su_badptr + stp->st_orglen, 3354 sug.su_badlen - stp->st_orglen); 3355 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); 3356 #ifdef FEAT_RIGHTLEFT 3357 if (cmdmsg_rl) 3358 rl_mirror(IObuff); 3359 #endif 3360 msg_puts((char *)IObuff); 3361 3362 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy); 3363 msg_puts((char *)IObuff); 3364 3365 /* The word may replace more than "su_badlen". */ 3366 if (sug.su_badlen < stp->st_orglen) 3367 { 3368 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""), 3369 stp->st_orglen, sug.su_badptr); 3370 msg_puts((char *)IObuff); 3371 } 3372 3373 if (p_verbose > 0) 3374 { 3375 /* Add the score. */ 3376 if (sps_flags & (SPS_DOUBLE | SPS_BEST)) 3377 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)", 3378 stp->st_salscore ? "s " : "", 3379 stp->st_score, stp->st_altscore); 3380 else 3381 vim_snprintf((char *)IObuff, IOSIZE, " (%d)", 3382 stp->st_score); 3383 #ifdef FEAT_RIGHTLEFT 3384 if (cmdmsg_rl) 3385 /* Mirror the numbers, but keep the leading space. */ 3386 rl_mirror(IObuff + 1); 3387 #endif 3388 msg_advance(30); 3389 msg_puts((char *)IObuff); 3390 } 3391 msg_putchar('\n'); 3392 } 3393 3394 #ifdef FEAT_RIGHTLEFT 3395 cmdmsg_rl = FALSE; 3396 msg_col = 0; 3397 #endif 3398 /* Ask for choice. */ 3399 selected = prompt_for_number(&mouse_used); 3400 if (mouse_used) 3401 selected -= lines_left; 3402 lines_left = Rows; /* avoid more prompt */ 3403 /* don't delay for 'smd' in normal_cmd() */ 3404 msg_scroll = msg_scroll_save; 3405 } 3406 3407 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) 3408 { 3409 /* Save the from and to text for :spellrepall. */ 3410 stp = &SUG(sug.su_ga, selected - 1); 3411 if (sug.su_badlen > stp->st_orglen) 3412 { 3413 /* Replacing less than "su_badlen", append the remainder to 3414 * repl_to. 
*/ 3415 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen); 3416 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word, 3417 sug.su_badlen - stp->st_orglen, 3418 sug.su_badptr + stp->st_orglen); 3419 repl_to = vim_strsave(IObuff); 3420 } 3421 else 3422 { 3423 /* Replacing su_badlen or more, use the whole word. */ 3424 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); 3425 repl_to = vim_strsave(stp->st_word); 3426 } 3427 3428 /* Replace the word. */ 3429 p = alloc((unsigned)STRLEN(line) - stp->st_orglen 3430 + stp->st_wordlen + 1); 3431 if (p != NULL) 3432 { 3433 c = (int)(sug.su_badptr - line); 3434 mch_memmove(p, line, c); 3435 STRCPY(p + c, stp->st_word); 3436 STRCAT(p, sug.su_badptr + stp->st_orglen); 3437 ml_replace(curwin->w_cursor.lnum, p, FALSE); 3438 curwin->w_cursor.col = c; 3439 3440 /* For redo we use a change-word command. */ 3441 ResetRedobuff(); 3442 AppendToRedobuff((char_u *)"ciw"); 3443 AppendToRedobuffLit(p + c, 3444 stp->st_wordlen + sug.su_badlen - stp->st_orglen); 3445 AppendCharToRedobuff(ESC); 3446 3447 /* After this "p" may be invalid. */ 3448 changed_bytes(curwin->w_cursor.lnum, c); 3449 } 3450 } 3451 else 3452 curwin->w_cursor = prev_cursor; 3453 3454 spell_find_cleanup(&sug); 3455 skip: 3456 vim_free(line); 3457 } 3458 3459 /* 3460 * Check if the word at line "lnum" column "col" is required to start with a 3461 * capital. This uses 'spellcapcheck' of the current buffer. 3462 */ 3463 static int 3464 check_need_cap(linenr_T lnum, colnr_T col) 3465 { 3466 int need_cap = FALSE; 3467 char_u *line; 3468 char_u *line_copy = NULL; 3469 char_u *p; 3470 colnr_T endcol; 3471 regmatch_T regmatch; 3472 3473 if (curwin->w_s->b_cap_prog == NULL) 3474 return FALSE; 3475 3476 line = ml_get_curline(); 3477 endcol = 0; 3478 if (getwhitecols(line) >= (int)col) 3479 { 3480 /* At start of line, check if previous line is empty or sentence 3481 * ends there. 
*/ 3482 if (lnum == 1) 3483 need_cap = TRUE; 3484 else 3485 { 3486 line = ml_get(lnum - 1); 3487 if (*skipwhite(line) == NUL) 3488 need_cap = TRUE; 3489 else 3490 { 3491 /* Append a space in place of the line break. */ 3492 line_copy = concat_str(line, (char_u *)" "); 3493 line = line_copy; 3494 endcol = (colnr_T)STRLEN(line); 3495 } 3496 } 3497 } 3498 else 3499 endcol = col; 3500 3501 if (endcol > 0) 3502 { 3503 /* Check if sentence ends before the bad word. */ 3504 regmatch.regprog = curwin->w_s->b_cap_prog; 3505 regmatch.rm_ic = FALSE; 3506 p = line + endcol; 3507 for (;;) 3508 { 3509 MB_PTR_BACK(line, p); 3510 if (p == line || spell_iswordp_nmw(p, curwin)) 3511 break; 3512 if (vim_regexec(®match, p, 0) 3513 && regmatch.endp[0] == line + endcol) 3514 { 3515 need_cap = TRUE; 3516 break; 3517 } 3518 } 3519 curwin->w_s->b_cap_prog = regmatch.regprog; 3520 } 3521 3522 vim_free(line_copy); 3523 3524 return need_cap; 3525 } 3526 3527 3528 /* 3529 * ":spellrepall" 3530 */ 3531 void 3532 ex_spellrepall(exarg_T *eap UNUSED) 3533 { 3534 pos_T pos = curwin->w_cursor; 3535 char_u *frompat; 3536 int addlen; 3537 char_u *line; 3538 char_u *p; 3539 int save_ws = p_ws; 3540 linenr_T prev_lnum = 0; 3541 3542 if (repl_from == NULL || repl_to == NULL) 3543 { 3544 emsg(_("E752: No previous spell replacement")); 3545 return; 3546 } 3547 addlen = (int)(STRLEN(repl_to) - STRLEN(repl_from)); 3548 3549 frompat = alloc((unsigned)STRLEN(repl_from) + 7); 3550 if (frompat == NULL) 3551 return; 3552 sprintf((char *)frompat, "\\V\\<%s\\>", repl_from); 3553 p_ws = FALSE; 3554 3555 sub_nsubs = 0; 3556 sub_nlines = 0; 3557 curwin->w_cursor.lnum = 0; 3558 while (!got_int) 3559 { 3560 if (do_search(NULL, '/', frompat, 1L, SEARCH_KEEP, NULL, NULL) == 0 3561 || u_save_cursor() == FAIL) 3562 break; 3563 3564 /* Only replace when the right word isn't there yet. This happens 3565 * when changing "etc" to "etc.". 
*/ 3566 line = ml_get_curline(); 3567 if (addlen <= 0 || STRNCMP(line + curwin->w_cursor.col, 3568 repl_to, STRLEN(repl_to)) != 0) 3569 { 3570 p = alloc((unsigned)STRLEN(line) + addlen + 1); 3571 if (p == NULL) 3572 break; 3573 mch_memmove(p, line, curwin->w_cursor.col); 3574 STRCPY(p + curwin->w_cursor.col, repl_to); 3575 STRCAT(p, line + curwin->w_cursor.col + STRLEN(repl_from)); 3576 ml_replace(curwin->w_cursor.lnum, p, FALSE); 3577 changed_bytes(curwin->w_cursor.lnum, curwin->w_cursor.col); 3578 3579 if (curwin->w_cursor.lnum != prev_lnum) 3580 { 3581 ++sub_nlines; 3582 prev_lnum = curwin->w_cursor.lnum; 3583 } 3584 ++sub_nsubs; 3585 } 3586 curwin->w_cursor.col += (colnr_T)STRLEN(repl_to); 3587 } 3588 3589 p_ws = save_ws; 3590 curwin->w_cursor = pos; 3591 vim_free(frompat); 3592 3593 if (sub_nsubs == 0) 3594 semsg(_("E753: Not found: %s"), repl_from); 3595 else 3596 do_sub_msg(FALSE); 3597 } 3598 3599 /* 3600 * Find spell suggestions for "word". Return them in the growarray "*gap" as 3601 * a list of allocated strings. 3602 */ 3603 void 3604 spell_suggest_list( 3605 garray_T *gap, 3606 char_u *word, 3607 int maxcount, /* maximum nr of suggestions */ 3608 int need_cap, /* 'spellcapcheck' matched */ 3609 int interactive) 3610 { 3611 suginfo_T sug; 3612 int i; 3613 suggest_T *stp; 3614 char_u *wcopy; 3615 3616 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive); 3617 3618 /* Make room in "gap". */ 3619 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); 3620 if (ga_grow(gap, sug.su_ga.ga_len) == OK) 3621 { 3622 for (i = 0; i < sug.su_ga.ga_len; ++i) 3623 { 3624 stp = &SUG(sug.su_ga, i); 3625 3626 /* The suggested word may replace only part of "word", add the not 3627 * replaced part. 
*/ 3628 wcopy = alloc(stp->st_wordlen 3629 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1); 3630 if (wcopy == NULL) 3631 break; 3632 STRCPY(wcopy, stp->st_word); 3633 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); 3634 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; 3635 } 3636 } 3637 3638 spell_find_cleanup(&sug); 3639 } 3640 3641 /* 3642 * Find spell suggestions for the word at the start of "badptr". 3643 * Return the suggestions in "su->su_ga". 3644 * The maximum number of suggestions is "maxcount". 3645 * Note: does use info for the current window. 3646 * This is based on the mechanisms of Aspell, but completely reimplemented. 3647 */ 3648 static void 3649 spell_find_suggest( 3650 char_u *badptr, 3651 int badlen, /* length of bad word or 0 if unknown */ 3652 suginfo_T *su, 3653 int maxcount, 3654 int banbadword, /* don't include badword in suggestions */ 3655 int need_cap, /* word should start with capital */ 3656 int interactive) 3657 { 3658 hlf_T attr = HLF_COUNT; 3659 char_u buf[MAXPATHL]; 3660 char_u *p; 3661 int do_combine = FALSE; 3662 char_u *sps_copy; 3663 #ifdef FEAT_EVAL 3664 static int expr_busy = FALSE; 3665 #endif 3666 int c; 3667 int i; 3668 langp_T *lp; 3669 3670 /* 3671 * Set the info in "*su". 
3672 */ 3673 vim_memset(su, 0, sizeof(suginfo_T)); 3674 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10); 3675 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10); 3676 if (*badptr == NUL) 3677 return; 3678 hash_init(&su->su_banned); 3679 3680 su->su_badptr = badptr; 3681 if (badlen != 0) 3682 su->su_badlen = badlen; 3683 else 3684 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE); 3685 su->su_maxcount = maxcount; 3686 su->su_maxscore = SCORE_MAXINIT; 3687 3688 if (su->su_badlen >= MAXWLEN) 3689 su->su_badlen = MAXWLEN - 1; /* just in case */ 3690 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen); 3691 (void)spell_casefold(su->su_badptr, su->su_badlen, 3692 su->su_fbadword, MAXWLEN); 3693 /* TODO: make this work if the case-folded text is longer than the original 3694 * text. Currently an illegal byte causes wrong pointer computations. */ 3695 su->su_fbadword[su->su_badlen] = NUL; 3696 3697 /* get caps flags for bad word */ 3698 su->su_badflags = badword_captype(su->su_badptr, 3699 su->su_badptr + su->su_badlen); 3700 if (need_cap) 3701 su->su_badflags |= WF_ONECAP; 3702 3703 /* Find the default language for sound folding. We simply use the first 3704 * one in 'spelllang' that supports sound folding. That's good for when 3705 * using multiple files for one language, it's not that bad when mixing 3706 * languages (e.g., "pl,en"). */ 3707 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i) 3708 { 3709 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i); 3710 if (lp->lp_sallang != NULL) 3711 { 3712 su->su_sallang = lp->lp_sallang; 3713 break; 3714 } 3715 } 3716 3717 /* Soundfold the bad word with the default sound folding, so that we don't 3718 * have to do this many times. */ 3719 if (su->su_sallang != NULL) 3720 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE, 3721 su->su_sal_badword); 3722 3723 /* If the word is not capitalised and spell_check() doesn't consider the 3724 * word to be bad then it might need to be capitalised. 
Add a suggestion 3725 * for that. */ 3726 c = PTR2CHAR(su->su_badptr); 3727 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) 3728 { 3729 make_case_word(su->su_badword, buf, WF_ONECAP); 3730 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, 3731 0, TRUE, su->su_sallang, FALSE); 3732 } 3733 3734 /* Ban the bad word itself. It may appear in another region. */ 3735 if (banbadword) 3736 add_banned(su, su->su_badword); 3737 3738 /* Make a copy of 'spellsuggest', because the expression may change it. */ 3739 sps_copy = vim_strsave(p_sps); 3740 if (sps_copy == NULL) 3741 return; 3742 3743 /* Loop over the items in 'spellsuggest'. */ 3744 for (p = sps_copy; *p != NUL; ) 3745 { 3746 copy_option_part(&p, buf, MAXPATHL, ","); 3747 3748 if (STRNCMP(buf, "expr:", 5) == 0) 3749 { 3750 #ifdef FEAT_EVAL 3751 /* Evaluate an expression. Skip this when called recursively, 3752 * when using spellsuggest() in the expression. */ 3753 if (!expr_busy) 3754 { 3755 expr_busy = TRUE; 3756 spell_suggest_expr(su, buf + 5); 3757 expr_busy = FALSE; 3758 } 3759 #endif 3760 } 3761 else if (STRNCMP(buf, "file:", 5) == 0) 3762 /* Use list of suggestions in a file. */ 3763 spell_suggest_file(su, buf + 5); 3764 else 3765 { 3766 /* Use internal method. */ 3767 spell_suggest_intern(su, interactive); 3768 if (sps_flags & SPS_DOUBLE) 3769 do_combine = TRUE; 3770 } 3771 } 3772 3773 vim_free(sps_copy); 3774 3775 if (do_combine) 3776 /* Combine the two list of suggestions. This must be done last, 3777 * because sorting changes the order again. */ 3778 score_combine(su); 3779 } 3780 3781 #ifdef FEAT_EVAL 3782 /* 3783 * Find suggestions by evaluating expression "expr". 3784 */ 3785 static void 3786 spell_suggest_expr(suginfo_T *su, char_u *expr) 3787 { 3788 list_T *list; 3789 listitem_T *li; 3790 int score; 3791 char_u *p; 3792 3793 /* The work is split up in a few parts to avoid having to export 3794 * suginfo_T. 3795 * First evaluate the expression and get the resulting list. 
*/ 3796 list = eval_spell_expr(su->su_badword, expr); 3797 if (list != NULL) 3798 { 3799 /* Loop over the items in the list. */ 3800 for (li = list->lv_first; li != NULL; li = li->li_next) 3801 if (li->li_tv.v_type == VAR_LIST) 3802 { 3803 /* Get the word and the score from the items. */ 3804 score = get_spellword(li->li_tv.vval.v_list, &p); 3805 if (score >= 0 && score <= su->su_maxscore) 3806 add_suggestion(su, &su->su_ga, p, su->su_badlen, 3807 score, 0, TRUE, su->su_sallang, FALSE); 3808 } 3809 list_unref(list); 3810 } 3811 3812 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3813 check_suggestions(su, &su->su_ga); 3814 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3815 } 3816 #endif 3817 3818 /* 3819 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'. 3820 */ 3821 static void 3822 spell_suggest_file(suginfo_T *su, char_u *fname) 3823 { 3824 FILE *fd; 3825 char_u line[MAXWLEN * 2]; 3826 char_u *p; 3827 int len; 3828 char_u cword[MAXWLEN]; 3829 3830 /* Open the file. */ 3831 fd = mch_fopen((char *)fname, "r"); 3832 if (fd == NULL) 3833 { 3834 semsg(_(e_notopen), fname); 3835 return; 3836 } 3837 3838 /* Read it line by line. */ 3839 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) 3840 { 3841 line_breakcheck(); 3842 3843 p = vim_strchr(line, '/'); 3844 if (p == NULL) 3845 continue; /* No Tab found, just skip the line. */ 3846 *p++ = NUL; 3847 if (STRICMP(su->su_badword, line) == 0) 3848 { 3849 /* Match! Isolate the good word, until CR or NL. */ 3850 for (len = 0; p[len] >= ' '; ++len) 3851 ; 3852 p[len] = NUL; 3853 3854 /* If the suggestion doesn't have specific case duplicate the case 3855 * of the bad word. 
*/ 3856 if (captype(p, NULL) == 0) 3857 { 3858 make_case_word(p, cword, su->su_badflags); 3859 p = cword; 3860 } 3861 3862 add_suggestion(su, &su->su_ga, p, su->su_badlen, 3863 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE); 3864 } 3865 } 3866 3867 fclose(fd); 3868 3869 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3870 check_suggestions(su, &su->su_ga); 3871 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3872 } 3873 3874 /* 3875 * Find suggestions for the internal method indicated by "sps_flags". 3876 */ 3877 static void 3878 spell_suggest_intern(suginfo_T *su, int interactive) 3879 { 3880 /* 3881 * Load the .sug file(s) that are available and not done yet. 3882 */ 3883 suggest_load_files(); 3884 3885 /* 3886 * 1. Try special cases, such as repeating a word: "the the" -> "the". 3887 * 3888 * Set a maximum score to limit the combination of operations that is 3889 * tried. 3890 */ 3891 suggest_try_special(su); 3892 3893 /* 3894 * 2. Try inserting/deleting/swapping/changing a letter, use REP entries 3895 * from the .aff file and inserting a space (split the word). 3896 */ 3897 suggest_try_change(su); 3898 3899 /* For the resulting top-scorers compute the sound-a-like score. */ 3900 if (sps_flags & SPS_DOUBLE) 3901 score_comp_sal(su); 3902 3903 /* 3904 * 3. Try finding sound-a-like words. 3905 */ 3906 if ((sps_flags & SPS_FAST) == 0) 3907 { 3908 if (sps_flags & SPS_BEST) 3909 /* Adjust the word score for the suggestions found so far for how 3910 * they sounds like. */ 3911 rescore_suggestions(su); 3912 3913 /* 3914 * While going through the soundfold tree "su_maxscore" is the score 3915 * for the soundfold word, limits the changes that are being tried, 3916 * and "su_sfmaxscore" the rescored score, which is set by 3917 * cleanup_suggestions(). 3918 * First find words with a small edit distance, because this is much 3919 * faster and often already finds the top-N suggestions. 
If we didn't 3920 * find many suggestions try again with a higher edit distance. 3921 * "sl_sounddone" is used to avoid doing the same word twice. 3922 */ 3923 suggest_try_soundalike_prep(); 3924 su->su_maxscore = SCORE_SFMAX1; 3925 su->su_sfmaxscore = SCORE_MAXINIT * 3; 3926 suggest_try_soundalike(su); 3927 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) 3928 { 3929 /* We didn't find enough matches, try again, allowing more 3930 * changes to the soundfold word. */ 3931 su->su_maxscore = SCORE_SFMAX2; 3932 suggest_try_soundalike(su); 3933 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) 3934 { 3935 /* Still didn't find enough matches, try again, allowing even 3936 * more changes to the soundfold word. */ 3937 su->su_maxscore = SCORE_SFMAX3; 3938 suggest_try_soundalike(su); 3939 } 3940 } 3941 su->su_maxscore = su->su_sfmaxscore; 3942 suggest_try_soundalike_finish(); 3943 } 3944 3945 /* When CTRL-C was hit while searching do show the results. Only clear 3946 * got_int when using a command, not for spellsuggest(). */ 3947 ui_breakcheck(); 3948 if (interactive && got_int) 3949 { 3950 (void)vgetc(); 3951 got_int = FALSE; 3952 } 3953 3954 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) 3955 { 3956 if (sps_flags & SPS_BEST) 3957 /* Adjust the word score for how it sounds like. */ 3958 rescore_suggestions(su); 3959 3960 /* Remove bogus suggestions, sort and truncate at "maxcount". */ 3961 check_suggestions(su, &su->su_ga); 3962 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 3963 } 3964 } 3965 3966 /* 3967 * Free the info put in "*su" by spell_find_suggest(). 3968 */ 3969 static void 3970 spell_find_cleanup(suginfo_T *su) 3971 { 3972 int i; 3973 3974 /* Free the suggestions. */ 3975 for (i = 0; i < su->su_ga.ga_len; ++i) 3976 vim_free(SUG(su->su_ga, i).st_word); 3977 ga_clear(&su->su_ga); 3978 for (i = 0; i < su->su_sga.ga_len; ++i) 3979 vim_free(SUG(su->su_sga, i).st_word); 3980 ga_clear(&su->su_sga); 3981 3982 /* Free the banned words. 
*/ 3983 hash_clear_all(&su->su_banned, 0); 3984 } 3985 3986 /* 3987 * Make a copy of "word", with the first letter upper or lower cased, to 3988 * "wcopy[MAXWLEN]". "word" must not be empty. 3989 * The result is NUL terminated. 3990 */ 3991 void 3992 onecap_copy( 3993 char_u *word, 3994 char_u *wcopy, 3995 int upper) /* TRUE: first letter made upper case */ 3996 { 3997 char_u *p; 3998 int c; 3999 int l; 4000 4001 p = word; 4002 if (has_mbyte) 4003 c = mb_cptr2char_adv(&p); 4004 else 4005 c = *p++; 4006 if (upper) 4007 c = SPELL_TOUPPER(c); 4008 else 4009 c = SPELL_TOFOLD(c); 4010 if (has_mbyte) 4011 l = mb_char2bytes(c, wcopy); 4012 else 4013 { 4014 l = 1; 4015 wcopy[0] = c; 4016 } 4017 vim_strncpy(wcopy + l, p, MAXWLEN - l - 1); 4018 } 4019 4020 /* 4021 * Make a copy of "word" with all the letters upper cased into 4022 * "wcopy[MAXWLEN]". The result is NUL terminated. 4023 */ 4024 static void 4025 allcap_copy(char_u *word, char_u *wcopy) 4026 { 4027 char_u *s; 4028 char_u *d; 4029 int c; 4030 4031 d = wcopy; 4032 for (s = word; *s != NUL; ) 4033 { 4034 if (has_mbyte) 4035 c = mb_cptr2char_adv(&s); 4036 else 4037 c = *s++; 4038 4039 /* We only change 0xdf to SS when we are certain latin1 is used. It 4040 * would cause weird errors in other 8-bit encodings. */ 4041 if (enc_latin1like && c == 0xdf) 4042 { 4043 c = 'S'; 4044 if (d - wcopy >= MAXWLEN - 1) 4045 break; 4046 *d++ = c; 4047 } 4048 else 4049 c = SPELL_TOUPPER(c); 4050 4051 if (has_mbyte) 4052 { 4053 if (d - wcopy >= MAXWLEN - MB_MAXBYTES) 4054 break; 4055 d += mb_char2bytes(c, d); 4056 } 4057 else 4058 { 4059 if (d - wcopy >= MAXWLEN - 1) 4060 break; 4061 *d++ = c; 4062 } 4063 } 4064 *d = NUL; 4065 } 4066 4067 /* 4068 * Try finding suggestions by recognizing specific situations. 4069 */ 4070 static void 4071 suggest_try_special(suginfo_T *su) 4072 { 4073 char_u *p; 4074 size_t len; 4075 int c; 4076 char_u word[MAXWLEN]; 4077 4078 /* 4079 * Recognize a word that is repeated: "the the". 
4080 */ 4081 p = skiptowhite(su->su_fbadword); 4082 len = p - su->su_fbadword; 4083 p = skipwhite(p); 4084 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0) 4085 { 4086 /* Include badflags: if the badword is onecap or allcap 4087 * use that for the goodword too: "The the" -> "The". */ 4088 c = su->su_fbadword[len]; 4089 su->su_fbadword[len] = NUL; 4090 make_case_word(su->su_fbadword, word, su->su_badflags); 4091 su->su_fbadword[len] = c; 4092 4093 /* Give a soundalike score of 0, compute the score as if deleting one 4094 * character. */ 4095 add_suggestion(su, &su->su_ga, word, su->su_badlen, 4096 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE); 4097 } 4098 } 4099 4100 /* 4101 * Change the 0 to 1 to measure how much time is spent in each state. 4102 * Output is dumped in "suggestprof". 4103 */ 4104 #if 0 4105 # define SUGGEST_PROFILE 4106 proftime_T current; 4107 proftime_T total; 4108 proftime_T times[STATE_FINAL + 1]; 4109 long counts[STATE_FINAL + 1]; 4110 4111 static void 4112 prof_init(void) 4113 { 4114 for (int i = 0; i <= STATE_FINAL; ++i) 4115 { 4116 profile_zero(×[i]); 4117 counts[i] = 0; 4118 } 4119 profile_start(¤t); 4120 profile_start(&total); 4121 } 4122 4123 /* call before changing state */ 4124 static void 4125 prof_store(state_T state) 4126 { 4127 profile_end(¤t); 4128 profile_add(×[state], ¤t); 4129 ++counts[state]; 4130 profile_start(¤t); 4131 } 4132 # define PROF_STORE(state) prof_store(state); 4133 4134 static void 4135 prof_report(char *name) 4136 { 4137 FILE *fd = fopen("suggestprof", "a"); 4138 4139 profile_end(&total); 4140 fprintf(fd, "-----------------------\n"); 4141 fprintf(fd, "%s: %s\n", name, profile_msg(&total)); 4142 for (int i = 0; i <= STATE_FINAL; ++i) 4143 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(×[i]), counts[i]); 4144 fclose(fd); 4145 } 4146 #else 4147 # define PROF_STORE(state) 4148 #endif 4149 4150 /* 4151 * Try finding suggestions by adding/removing/swapping letters. 
4152 */ 4153 static void 4154 suggest_try_change(suginfo_T *su) 4155 { 4156 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */ 4157 int n; 4158 char_u *p; 4159 int lpi; 4160 langp_T *lp; 4161 4162 /* We make a copy of the case-folded bad word, so that we can modify it 4163 * to find matches (esp. REP items). Append some more text, changing 4164 * chars after the bad word may help. */ 4165 STRCPY(fword, su->su_fbadword); 4166 n = (int)STRLEN(fword); 4167 p = su->su_badptr + su->su_badlen; 4168 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n); 4169 4170 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 4171 { 4172 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 4173 4174 /* If reloading a spell file fails it's still in the list but 4175 * everything has been cleared. */ 4176 if (lp->lp_slang->sl_fbyts == NULL) 4177 continue; 4178 4179 /* Try it for this language. Will add possible suggestions. */ 4180 #ifdef SUGGEST_PROFILE 4181 prof_init(); 4182 #endif 4183 suggest_trie_walk(su, lp, fword, FALSE); 4184 #ifdef SUGGEST_PROFILE 4185 prof_report("try_change"); 4186 #endif 4187 } 4188 } 4189 4190 /* Check the maximum score, if we go over it we won't try this change. */ 4191 #define TRY_DEEPER(su, stack, depth, add) \ 4192 (stack[depth].ts_score + (add) < su->su_maxscore) 4193 4194 /* 4195 * Try finding suggestions by adding/removing/swapping letters. 4196 * 4197 * This uses a state machine. At each node in the tree we try various 4198 * operations. When trying if an operation works "depth" is increased and the 4199 * stack[] is used to store info. This allows combinations, thus insert one 4200 * character, replace one and delete another. The number of changes is 4201 * limited by su->su_maxscore. 4202 * 4203 * After implementing this I noticed an article by Kemal Oflazer that 4204 * describes something similar: "Error-tolerant Finite State Recognition with 4205 * Applications to Morphological Analysis and Spelling Correction" (1996). 
4206 * The implementation in the article is simplified and requires a stack of 4207 * unknown depth. The implementation here only needs a stack depth equal to 4208 * the length of the word. 4209 * 4210 * This is also used for the sound-folded word, "soundfold" is TRUE then. 4211 * The mechanism is the same, but we find a match with a sound-folded word 4212 * that comes from one or more original words. Each of these words may be 4213 * added, this is done by add_sound_suggest(). 4214 * Don't use: 4215 * the prefix tree or the keep-case tree 4216 * "su->su_badlen" 4217 * anything to do with upper and lower case 4218 * anything to do with word or non-word characters ("spell_iswordp()") 4219 * banned words 4220 * word flags (rare, region, compounding) 4221 * word splitting for now 4222 * "similar_chars()" 4223 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" 4224 */ 4225 static void 4226 suggest_trie_walk( 4227 suginfo_T *su, 4228 langp_T *lp, 4229 char_u *fword, 4230 int soundfold) 4231 { 4232 char_u tword[MAXWLEN]; /* good word collected so far */ 4233 trystate_T stack[MAXWLEN]; 4234 char_u preword[MAXWLEN * 3]; /* word found with proper case; 4235 * concatenation of prefix compound 4236 * words and split word. NUL terminated 4237 * when going deeper but not when coming 4238 * back. */ 4239 char_u compflags[MAXWLEN]; /* compound flags, one for each word */ 4240 trystate_T *sp; 4241 int newscore; 4242 int score; 4243 char_u *byts, *fbyts, *pbyts; 4244 idx_T *idxs, *fidxs, *pidxs; 4245 int depth; 4246 int c, c2, c3; 4247 int n = 0; 4248 int flags; 4249 garray_T *gap; 4250 idx_T arridx; 4251 int len; 4252 char_u *p; 4253 fromto_T *ftp; 4254 int fl = 0, tl; 4255 int repextra = 0; /* extra bytes in fword[] from REP item */ 4256 slang_T *slang = lp->lp_slang; 4257 int fword_ends; 4258 int goodword_ends; 4259 #ifdef DEBUG_TRIEWALK 4260 /* Stores the name of the change made at each level. 
*/ 4261 char_u changename[MAXWLEN][80]; 4262 #endif 4263 int breakcheckcount = 1000; 4264 int compound_ok; 4265 4266 /* 4267 * Go through the whole case-fold tree, try changes at each node. 4268 * "tword[]" contains the word collected from nodes in the tree. 4269 * "fword[]" the word we are trying to match with (initially the bad 4270 * word). 4271 */ 4272 depth = 0; 4273 sp = &stack[0]; 4274 vim_memset(sp, 0, sizeof(trystate_T)); 4275 sp->ts_curi = 1; 4276 4277 if (soundfold) 4278 { 4279 /* Going through the soundfold tree. */ 4280 byts = fbyts = slang->sl_sbyts; 4281 idxs = fidxs = slang->sl_sidxs; 4282 pbyts = NULL; 4283 pidxs = NULL; 4284 sp->ts_prefixdepth = PFD_NOPREFIX; 4285 sp->ts_state = STATE_START; 4286 } 4287 else 4288 { 4289 /* 4290 * When there are postponed prefixes we need to use these first. At 4291 * the end of the prefix we continue in the case-fold tree. 4292 */ 4293 fbyts = slang->sl_fbyts; 4294 fidxs = slang->sl_fidxs; 4295 pbyts = slang->sl_pbyts; 4296 pidxs = slang->sl_pidxs; 4297 if (pbyts != NULL) 4298 { 4299 byts = pbyts; 4300 idxs = pidxs; 4301 sp->ts_prefixdepth = PFD_PREFIXTREE; 4302 sp->ts_state = STATE_NOPREFIX; /* try without prefix first */ 4303 } 4304 else 4305 { 4306 byts = fbyts; 4307 idxs = fidxs; 4308 sp->ts_prefixdepth = PFD_NOPREFIX; 4309 sp->ts_state = STATE_START; 4310 } 4311 } 4312 4313 /* 4314 * Loop to find all suggestions. At each round we either: 4315 * - For the current state try one operation, advance "ts_curi", 4316 * increase "depth". 4317 * - When a state is done go to the next, set "ts_state". 4318 * - When all states are tried decrease "depth". 4319 */ 4320 while (depth >= 0 && !got_int) 4321 { 4322 sp = &stack[depth]; 4323 switch (sp->ts_state) 4324 { 4325 case STATE_START: 4326 case STATE_NOPREFIX: 4327 /* 4328 * Start of node: Deal with NUL bytes, which means 4329 * tword[] may end here. 
4330 */ 4331 arridx = sp->ts_arridx; /* current node in the tree */ 4332 len = byts[arridx]; /* bytes in this node */ 4333 arridx += sp->ts_curi; /* index of current byte */ 4334 4335 if (sp->ts_prefixdepth == PFD_PREFIXTREE) 4336 { 4337 /* Skip over the NUL bytes, we use them later. */ 4338 for (n = 0; n < len && byts[arridx + n] == 0; ++n) 4339 ; 4340 sp->ts_curi += n; 4341 4342 /* Always past NUL bytes now. */ 4343 n = (int)sp->ts_state; 4344 PROF_STORE(sp->ts_state) 4345 sp->ts_state = STATE_ENDNUL; 4346 sp->ts_save_badflags = su->su_badflags; 4347 4348 /* At end of a prefix or at start of prefixtree: check for 4349 * following word. */ 4350 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) 4351 { 4352 /* Set su->su_badflags to the caps type at this position. 4353 * Use the caps type until here for the prefix itself. */ 4354 if (has_mbyte) 4355 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 4356 else 4357 n = sp->ts_fidx; 4358 flags = badword_captype(su->su_badptr, su->su_badptr + n); 4359 su->su_badflags = badword_captype(su->su_badptr + n, 4360 su->su_badptr + su->su_badlen); 4361 #ifdef DEBUG_TRIEWALK 4362 sprintf(changename[depth], "prefix"); 4363 #endif 4364 go_deeper(stack, depth, 0); 4365 ++depth; 4366 sp = &stack[depth]; 4367 sp->ts_prefixdepth = depth - 1; 4368 byts = fbyts; 4369 idxs = fidxs; 4370 sp->ts_arridx = 0; 4371 4372 /* Move the prefix to preword[] with the right case 4373 * and make find_keepcap_word() works. */ 4374 tword[sp->ts_twordlen] = NUL; 4375 make_case_word(tword + sp->ts_splitoff, 4376 preword + sp->ts_prewordlen, flags); 4377 sp->ts_prewordlen = (char_u)STRLEN(preword); 4378 sp->ts_splitoff = sp->ts_twordlen; 4379 } 4380 break; 4381 } 4382 4383 if (sp->ts_curi > len || byts[arridx] != 0) 4384 { 4385 /* Past bytes in node and/or past NUL bytes. */ 4386 PROF_STORE(sp->ts_state) 4387 sp->ts_state = STATE_ENDNUL; 4388 sp->ts_save_badflags = su->su_badflags; 4389 break; 4390 } 4391 4392 /* 4393 * End of word in tree. 
4394 */ 4395 ++sp->ts_curi; /* eat one NUL byte */ 4396 4397 flags = (int)idxs[arridx]; 4398 4399 /* Skip words with the NOSUGGEST flag. */ 4400 if (flags & WF_NOSUGGEST) 4401 break; 4402 4403 fword_ends = (fword[sp->ts_fidx] == NUL 4404 || (soundfold 4405 ? VIM_ISWHITE(fword[sp->ts_fidx]) 4406 : !spell_iswordp(fword + sp->ts_fidx, curwin))); 4407 tword[sp->ts_twordlen] = NUL; 4408 4409 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL 4410 && (sp->ts_flags & TSF_PREFIXOK) == 0) 4411 { 4412 /* There was a prefix before the word. Check that the prefix 4413 * can be used with this word. */ 4414 /* Count the length of the NULs in the prefix. If there are 4415 * none this must be the first try without a prefix. */ 4416 n = stack[sp->ts_prefixdepth].ts_arridx; 4417 len = pbyts[n++]; 4418 for (c = 0; c < len && pbyts[n + c] == 0; ++c) 4419 ; 4420 if (c > 0) 4421 { 4422 c = valid_word_prefix(c, n, flags, 4423 tword + sp->ts_splitoff, slang, FALSE); 4424 if (c == 0) 4425 break; 4426 4427 /* Use the WF_RARE flag for a rare prefix. */ 4428 if (c & WF_RAREPFX) 4429 flags |= WF_RARE; 4430 4431 /* Tricky: when checking for both prefix and compounding 4432 * we run into the prefix flag first. 4433 * Remember that it's OK, so that we accept the prefix 4434 * when arriving at a compound flag. */ 4435 sp->ts_flags |= TSF_PREFIXOK; 4436 } 4437 } 4438 4439 /* Check NEEDCOMPOUND: can't use word without compounding. Do try 4440 * appending another compound word below. */ 4441 if (sp->ts_complen == sp->ts_compsplit && fword_ends 4442 && (flags & WF_NEEDCOMP)) 4443 goodword_ends = FALSE; 4444 else 4445 goodword_ends = TRUE; 4446 4447 p = NULL; 4448 compound_ok = TRUE; 4449 if (sp->ts_complen > sp->ts_compsplit) 4450 { 4451 if (slang->sl_nobreak) 4452 { 4453 /* There was a word before this word. When there was no 4454 * change in this word (it was correct) add the first word 4455 * as a suggestion. If this word was corrected too, we 4456 * need to check if a correct word follows. 
*/ 4457 if (sp->ts_fidx - sp->ts_splitfidx 4458 == sp->ts_twordlen - sp->ts_splitoff 4459 && STRNCMP(fword + sp->ts_splitfidx, 4460 tword + sp->ts_splitoff, 4461 sp->ts_fidx - sp->ts_splitfidx) == 0) 4462 { 4463 preword[sp->ts_prewordlen] = NUL; 4464 newscore = score_wordcount_adj(slang, sp->ts_score, 4465 preword + sp->ts_prewordlen, 4466 sp->ts_prewordlen > 0); 4467 /* Add the suggestion if the score isn't too bad. */ 4468 if (newscore <= su->su_maxscore) 4469 add_suggestion(su, &su->su_ga, preword, 4470 sp->ts_splitfidx - repextra, 4471 newscore, 0, FALSE, 4472 lp->lp_sallang, FALSE); 4473 break; 4474 } 4475 } 4476 else 4477 { 4478 /* There was a compound word before this word. If this 4479 * word does not support compounding then give up 4480 * (splitting is tried for the word without compound 4481 * flag). */ 4482 if (((unsigned)flags >> 24) == 0 4483 || sp->ts_twordlen - sp->ts_splitoff 4484 < slang->sl_compminlen) 4485 break; 4486 /* For multi-byte chars check character length against 4487 * COMPOUNDMIN. */ 4488 if (has_mbyte 4489 && slang->sl_compminlen > 0 4490 && mb_charlen(tword + sp->ts_splitoff) 4491 < slang->sl_compminlen) 4492 break; 4493 4494 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 4495 compflags[sp->ts_complen + 1] = NUL; 4496 vim_strncpy(preword + sp->ts_prewordlen, 4497 tword + sp->ts_splitoff, 4498 sp->ts_twordlen - sp->ts_splitoff); 4499 4500 /* Verify CHECKCOMPOUNDPATTERN rules. */ 4501 if (match_checkcompoundpattern(preword, sp->ts_prewordlen, 4502 &slang->sl_comppat)) 4503 compound_ok = FALSE; 4504 4505 if (compound_ok) 4506 { 4507 p = preword; 4508 while (*skiptowhite(p) != NUL) 4509 p = skipwhite(skiptowhite(p)); 4510 if (fword_ends && !can_compound(slang, p, 4511 compflags + sp->ts_compsplit)) 4512 /* Compound is not allowed. But it may still be 4513 * possible if we add another (short) word. */ 4514 compound_ok = FALSE; 4515 } 4516 4517 /* Get pointer to last char of previous word. 
*/ 4518 p = preword + sp->ts_prewordlen; 4519 MB_PTR_BACK(preword, p); 4520 } 4521 } 4522 4523 /* 4524 * Form the word with proper case in preword. 4525 * If there is a word from a previous split, append. 4526 * For the soundfold tree don't change the case, simply append. 4527 */ 4528 if (soundfold) 4529 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); 4530 else if (flags & WF_KEEPCAP) 4531 /* Must find the word in the keep-case tree. */ 4532 find_keepcap_word(slang, tword + sp->ts_splitoff, 4533 preword + sp->ts_prewordlen); 4534 else 4535 { 4536 /* Include badflags: If the badword is onecap or allcap 4537 * use that for the goodword too. But if the badword is 4538 * allcap and it's only one char long use onecap. */ 4539 c = su->su_badflags; 4540 if ((c & WF_ALLCAP) 4541 && su->su_badlen == (*mb_ptr2len)(su->su_badptr)) 4542 c = WF_ONECAP; 4543 c |= flags; 4544 4545 /* When appending a compound word after a word character don't 4546 * use Onecap. */ 4547 if (p != NULL && spell_iswordp_nmw(p, curwin)) 4548 c &= ~WF_ONECAP; 4549 make_case_word(tword + sp->ts_splitoff, 4550 preword + sp->ts_prewordlen, c); 4551 } 4552 4553 if (!soundfold) 4554 { 4555 /* Don't use a banned word. It may appear again as a good 4556 * word, thus remember it. 
*/ 4557 if (flags & WF_BANNED) 4558 { 4559 add_banned(su, preword + sp->ts_prewordlen); 4560 break; 4561 } 4562 if ((sp->ts_complen == sp->ts_compsplit 4563 && WAS_BANNED(su, preword + sp->ts_prewordlen)) 4564 || WAS_BANNED(su, preword)) 4565 { 4566 if (slang->sl_compprog == NULL) 4567 break; 4568 /* the word so far was banned but we may try compounding */ 4569 goodword_ends = FALSE; 4570 } 4571 } 4572 4573 newscore = 0; 4574 if (!soundfold) /* soundfold words don't have flags */ 4575 { 4576 if ((flags & WF_REGION) 4577 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 4578 newscore += SCORE_REGION; 4579 if (flags & WF_RARE) 4580 newscore += SCORE_RARE; 4581 4582 if (!spell_valid_case(su->su_badflags, 4583 captype(preword + sp->ts_prewordlen, NULL))) 4584 newscore += SCORE_ICASE; 4585 } 4586 4587 /* TODO: how about splitting in the soundfold tree? */ 4588 if (fword_ends 4589 && goodword_ends 4590 && sp->ts_fidx >= sp->ts_fidxtry 4591 && compound_ok) 4592 { 4593 /* The badword also ends: add suggestions. */ 4594 #ifdef DEBUG_TRIEWALK 4595 if (soundfold && STRCMP(preword, "smwrd") == 0) 4596 { 4597 int j; 4598 4599 /* print the stack of changes that brought us here */ 4600 smsg("------ %s -------", fword); 4601 for (j = 0; j < depth; ++j) 4602 smsg("%s", changename[j]); 4603 } 4604 #endif 4605 if (soundfold) 4606 { 4607 /* For soundfolded words we need to find the original 4608 * words, the edit distance and then add them. */ 4609 add_sound_suggest(su, preword, sp->ts_score, lp); 4610 } 4611 else if (sp->ts_fidx > 0) 4612 { 4613 /* Give a penalty when changing non-word char to word 4614 * char, e.g., "thes," -> "these". */ 4615 p = fword + sp->ts_fidx; 4616 MB_PTR_BACK(fword, p); 4617 if (!spell_iswordp(p, curwin)) 4618 { 4619 p = preword + STRLEN(preword); 4620 MB_PTR_BACK(preword, p); 4621 if (spell_iswordp(p, curwin)) 4622 newscore += SCORE_NONWORD; 4623 } 4624 4625 /* Give a bonus to words seen before. 
*/ 4626 score = score_wordcount_adj(slang, 4627 sp->ts_score + newscore, 4628 preword + sp->ts_prewordlen, 4629 sp->ts_prewordlen > 0); 4630 4631 /* Add the suggestion if the score isn't too bad. */ 4632 if (score <= su->su_maxscore) 4633 { 4634 add_suggestion(su, &su->su_ga, preword, 4635 sp->ts_fidx - repextra, 4636 score, 0, FALSE, lp->lp_sallang, FALSE); 4637 4638 if (su->su_badflags & WF_MIXCAP) 4639 { 4640 /* We really don't know if the word should be 4641 * upper or lower case, add both. */ 4642 c = captype(preword, NULL); 4643 if (c == 0 || c == WF_ALLCAP) 4644 { 4645 make_case_word(tword + sp->ts_splitoff, 4646 preword + sp->ts_prewordlen, 4647 c == 0 ? WF_ALLCAP : 0); 4648 4649 add_suggestion(su, &su->su_ga, preword, 4650 sp->ts_fidx - repextra, 4651 score + SCORE_ICASE, 0, FALSE, 4652 lp->lp_sallang, FALSE); 4653 } 4654 } 4655 } 4656 } 4657 } 4658 4659 /* 4660 * Try word split and/or compounding. 4661 */ 4662 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) 4663 /* Don't split halfway a character. */ 4664 && (!has_mbyte || sp->ts_tcharlen == 0)) 4665 { 4666 int try_compound; 4667 int try_split; 4668 4669 /* If past the end of the bad word don't try a split. 4670 * Otherwise try changing the next word. E.g., find 4671 * suggestions for "the the" where the second "the" is 4672 * different. It's done like a split. 4673 * TODO: word split for soundfold words */ 4674 try_split = (sp->ts_fidx - repextra < su->su_badlen) 4675 && !soundfold; 4676 4677 /* Get here in several situations: 4678 * 1. The word in the tree ends: 4679 * If the word allows compounding try that. Otherwise try 4680 * a split by inserting a space. For both check that a 4681 * valid words starts at fword[sp->ts_fidx]. 4682 * For NOBREAK do like compounding to be able to check if 4683 * the next word is valid. 4684 * 2. The badword does end, but it was due to a change (e.g., 4685 * a swap). No need to split, but do check that the 4686 * following word is valid. 4687 * 3. 
The badword and the word in the tree end. It may still 4688 * be possible to compound another (short) word. 4689 */ 4690 try_compound = FALSE; 4691 if (!soundfold 4692 && !slang->sl_nocompoundsugs 4693 && slang->sl_compprog != NULL 4694 && ((unsigned)flags >> 24) != 0 4695 && sp->ts_twordlen - sp->ts_splitoff 4696 >= slang->sl_compminlen 4697 && (!has_mbyte 4698 || slang->sl_compminlen == 0 4699 || mb_charlen(tword + sp->ts_splitoff) 4700 >= slang->sl_compminlen) 4701 && (slang->sl_compsylmax < MAXWLEN 4702 || sp->ts_complen + 1 - sp->ts_compsplit 4703 < slang->sl_compmax) 4704 && (can_be_compound(sp, slang, 4705 compflags, ((unsigned)flags >> 24)))) 4706 4707 { 4708 try_compound = TRUE; 4709 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 4710 compflags[sp->ts_complen + 1] = NUL; 4711 } 4712 4713 /* For NOBREAK we never try splitting, it won't make any word 4714 * valid. */ 4715 if (slang->sl_nobreak && !slang->sl_nocompoundsugs) 4716 try_compound = TRUE; 4717 4718 /* If we could add a compound word, and it's also possible to 4719 * split at this point, do the split first and set 4720 * TSF_DIDSPLIT to avoid doing it again. */ 4721 else if (!fword_ends 4722 && try_compound 4723 && (sp->ts_flags & TSF_DIDSPLIT) == 0) 4724 { 4725 try_compound = FALSE; 4726 sp->ts_flags |= TSF_DIDSPLIT; 4727 --sp->ts_curi; /* do the same NUL again */ 4728 compflags[sp->ts_complen] = NUL; 4729 } 4730 else 4731 sp->ts_flags &= ~TSF_DIDSPLIT; 4732 4733 if (try_split || try_compound) 4734 { 4735 if (!try_compound && (!fword_ends || !goodword_ends)) 4736 { 4737 /* If we're going to split need to check that the 4738 * words so far are valid for compounding. If there 4739 * is only one word it must not have the NEEDCOMPOUND 4740 * flag. 
*/ 4741 if (sp->ts_complen == sp->ts_compsplit 4742 && (flags & WF_NEEDCOMP)) 4743 break; 4744 p = preword; 4745 while (*skiptowhite(p) != NUL) 4746 p = skipwhite(skiptowhite(p)); 4747 if (sp->ts_complen > sp->ts_compsplit 4748 && !can_compound(slang, p, 4749 compflags + sp->ts_compsplit)) 4750 break; 4751 4752 if (slang->sl_nosplitsugs) 4753 newscore += SCORE_SPLIT_NO; 4754 else 4755 newscore += SCORE_SPLIT; 4756 4757 /* Give a bonus to words seen before. */ 4758 newscore = score_wordcount_adj(slang, newscore, 4759 preword + sp->ts_prewordlen, TRUE); 4760 } 4761 4762 if (TRY_DEEPER(su, stack, depth, newscore)) 4763 { 4764 go_deeper(stack, depth, newscore); 4765 #ifdef DEBUG_TRIEWALK 4766 if (!try_compound && !fword_ends) 4767 sprintf(changename[depth], "%.*s-%s: split", 4768 sp->ts_twordlen, tword, fword + sp->ts_fidx); 4769 else 4770 sprintf(changename[depth], "%.*s-%s: compound", 4771 sp->ts_twordlen, tword, fword + sp->ts_fidx); 4772 #endif 4773 /* Save things to be restored at STATE_SPLITUNDO. */ 4774 sp->ts_save_badflags = su->su_badflags; 4775 PROF_STORE(sp->ts_state) 4776 sp->ts_state = STATE_SPLITUNDO; 4777 4778 ++depth; 4779 sp = &stack[depth]; 4780 4781 /* Append a space to preword when splitting. */ 4782 if (!try_compound && !fword_ends) 4783 STRCAT(preword, " "); 4784 sp->ts_prewordlen = (char_u)STRLEN(preword); 4785 sp->ts_splitoff = sp->ts_twordlen; 4786 sp->ts_splitfidx = sp->ts_fidx; 4787 4788 /* If the badword has a non-word character at this 4789 * position skip it. That means replacing the 4790 * non-word character with a space. Always skip a 4791 * character when the word ends. But only when the 4792 * good word can end. */ 4793 if (((!try_compound && !spell_iswordp_nmw(fword 4794 + sp->ts_fidx, 4795 curwin)) 4796 || fword_ends) 4797 && fword[sp->ts_fidx] != NUL 4798 && goodword_ends) 4799 { 4800 int l; 4801 4802 l = MB_PTR2LEN(fword + sp->ts_fidx); 4803 if (fword_ends) 4804 { 4805 /* Copy the skipped character to preword. 
*/ 4806 mch_memmove(preword + sp->ts_prewordlen, 4807 fword + sp->ts_fidx, l); 4808 sp->ts_prewordlen += l; 4809 preword[sp->ts_prewordlen] = NUL; 4810 } 4811 else 4812 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST; 4813 sp->ts_fidx += l; 4814 } 4815 4816 /* When compounding include compound flag in 4817 * compflags[] (already set above). When splitting we 4818 * may start compounding over again. */ 4819 if (try_compound) 4820 ++sp->ts_complen; 4821 else 4822 sp->ts_compsplit = sp->ts_complen; 4823 sp->ts_prefixdepth = PFD_NOPREFIX; 4824 4825 /* set su->su_badflags to the caps type at this 4826 * position */ 4827 if (has_mbyte) 4828 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 4829 else 4830 n = sp->ts_fidx; 4831 su->su_badflags = badword_captype(su->su_badptr + n, 4832 su->su_badptr + su->su_badlen); 4833 4834 /* Restart at top of the tree. */ 4835 sp->ts_arridx = 0; 4836 4837 /* If there are postponed prefixes, try these too. */ 4838 if (pbyts != NULL) 4839 { 4840 byts = pbyts; 4841 idxs = pidxs; 4842 sp->ts_prefixdepth = PFD_PREFIXTREE; 4843 PROF_STORE(sp->ts_state) 4844 sp->ts_state = STATE_NOPREFIX; 4845 } 4846 } 4847 } 4848 } 4849 break; 4850 4851 case STATE_SPLITUNDO: 4852 /* Undo the changes done for word split or compound word. */ 4853 su->su_badflags = sp->ts_save_badflags; 4854 4855 /* Continue looking for NUL bytes. */ 4856 PROF_STORE(sp->ts_state) 4857 sp->ts_state = STATE_START; 4858 4859 /* In case we went into the prefix tree. */ 4860 byts = fbyts; 4861 idxs = fidxs; 4862 break; 4863 4864 case STATE_ENDNUL: 4865 /* Past the NUL bytes in the node. */ 4866 su->su_badflags = sp->ts_save_badflags; 4867 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0) 4868 { 4869 /* The badword ends, can't use STATE_PLAIN. 
*/ 4870 PROF_STORE(sp->ts_state) 4871 sp->ts_state = STATE_DEL; 4872 break; 4873 } 4874 PROF_STORE(sp->ts_state) 4875 sp->ts_state = STATE_PLAIN; 4876 /* FALLTHROUGH */ 4877 4878 case STATE_PLAIN: 4879 /* 4880 * Go over all possible bytes at this node, add each to tword[] 4881 * and use child node. "ts_curi" is the index. 4882 */ 4883 arridx = sp->ts_arridx; 4884 if (sp->ts_curi > byts[arridx]) 4885 { 4886 /* Done all bytes at this node, do next state. When still at 4887 * already changed bytes skip the other tricks. */ 4888 PROF_STORE(sp->ts_state) 4889 if (sp->ts_fidx >= sp->ts_fidxtry) 4890 sp->ts_state = STATE_DEL; 4891 else 4892 sp->ts_state = STATE_FINAL; 4893 } 4894 else 4895 { 4896 arridx += sp->ts_curi++; 4897 c = byts[arridx]; 4898 4899 /* Normal byte, go one level deeper. If it's not equal to the 4900 * byte in the bad word adjust the score. But don't even try 4901 * when the byte was already changed. And don't try when we 4902 * just deleted this byte, accepting it is always cheaper than 4903 * delete + substitute. 
*/ 4904 if (c == fword[sp->ts_fidx] 4905 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE)) 4906 newscore = 0; 4907 else 4908 newscore = SCORE_SUBST; 4909 if ((newscore == 0 4910 || (sp->ts_fidx >= sp->ts_fidxtry 4911 && ((sp->ts_flags & TSF_DIDDEL) == 0 4912 || c != fword[sp->ts_delidx]))) 4913 && TRY_DEEPER(su, stack, depth, newscore)) 4914 { 4915 go_deeper(stack, depth, newscore); 4916 #ifdef DEBUG_TRIEWALK 4917 if (newscore > 0) 4918 sprintf(changename[depth], "%.*s-%s: subst %c to %c", 4919 sp->ts_twordlen, tword, fword + sp->ts_fidx, 4920 fword[sp->ts_fidx], c); 4921 else 4922 sprintf(changename[depth], "%.*s-%s: accept %c", 4923 sp->ts_twordlen, tword, fword + sp->ts_fidx, 4924 fword[sp->ts_fidx]); 4925 #endif 4926 ++depth; 4927 sp = &stack[depth]; 4928 ++sp->ts_fidx; 4929 tword[sp->ts_twordlen++] = c; 4930 sp->ts_arridx = idxs[arridx]; 4931 if (newscore == SCORE_SUBST) 4932 sp->ts_isdiff = DIFF_YES; 4933 if (has_mbyte) 4934 { 4935 /* Multi-byte characters are a bit complicated to 4936 * handle: They differ when any of the bytes differ 4937 * and then their length may also differ. */ 4938 if (sp->ts_tcharlen == 0) 4939 { 4940 /* First byte. */ 4941 sp->ts_tcharidx = 0; 4942 sp->ts_tcharlen = MB_BYTE2LEN(c); 4943 sp->ts_fcharstart = sp->ts_fidx - 1; 4944 sp->ts_isdiff = (newscore != 0) 4945 ? DIFF_YES : DIFF_NONE; 4946 } 4947 else if (sp->ts_isdiff == DIFF_INSERT) 4948 /* When inserting trail bytes don't advance in the 4949 * bad word. */ 4950 --sp->ts_fidx; 4951 if (++sp->ts_tcharidx == sp->ts_tcharlen) 4952 { 4953 /* Last byte of character. */ 4954 if (sp->ts_isdiff == DIFF_YES) 4955 { 4956 /* Correct ts_fidx for the byte length of the 4957 * character (we didn't check that before). */ 4958 sp->ts_fidx = sp->ts_fcharstart 4959 + MB_PTR2LEN( 4960 fword + sp->ts_fcharstart); 4961 /* For changing a composing character adjust 4962 * the score from SCORE_SUBST to 4963 * SCORE_SUBCOMP. 
*/ 4964 if (enc_utf8 4965 && utf_iscomposing( 4966 utf_ptr2char(tword 4967 + sp->ts_twordlen 4968 - sp->ts_tcharlen)) 4969 && utf_iscomposing( 4970 utf_ptr2char(fword 4971 + sp->ts_fcharstart))) 4972 sp->ts_score -= 4973 SCORE_SUBST - SCORE_SUBCOMP; 4974 4975 /* For a similar character adjust score from 4976 * SCORE_SUBST to SCORE_SIMILAR. */ 4977 else if (!soundfold 4978 && slang->sl_has_map 4979 && similar_chars(slang, 4980 mb_ptr2char(tword 4981 + sp->ts_twordlen 4982 - sp->ts_tcharlen), 4983 mb_ptr2char(fword 4984 + sp->ts_fcharstart))) 4985 sp->ts_score -= 4986 SCORE_SUBST - SCORE_SIMILAR; 4987 } 4988 else if (sp->ts_isdiff == DIFF_INSERT 4989 && sp->ts_twordlen > sp->ts_tcharlen) 4990 { 4991 p = tword + sp->ts_twordlen - sp->ts_tcharlen; 4992 c = mb_ptr2char(p); 4993 if (enc_utf8 && utf_iscomposing(c)) 4994 { 4995 /* Inserting a composing char doesn't 4996 * count that much. */ 4997 sp->ts_score -= SCORE_INS - SCORE_INSCOMP; 4998 } 4999 else 5000 { 5001 /* If the previous character was the same, 5002 * thus doubling a character, give a bonus 5003 * to the score. Also for the soundfold 5004 * tree (might seem illogical but does 5005 * give better scores). */ 5006 MB_PTR_BACK(tword, p); 5007 if (c == mb_ptr2char(p)) 5008 sp->ts_score -= SCORE_INS 5009 - SCORE_INSDUP; 5010 } 5011 } 5012 5013 /* Starting a new char, reset the length. */ 5014 sp->ts_tcharlen = 0; 5015 } 5016 } 5017 else 5018 { 5019 /* If we found a similar char adjust the score. 5020 * We do this after calling go_deeper() because 5021 * it's slow. */ 5022 if (newscore != 0 5023 && !soundfold 5024 && slang->sl_has_map 5025 && similar_chars(slang, 5026 c, fword[sp->ts_fidx - 1])) 5027 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; 5028 } 5029 } 5030 } 5031 break; 5032 5033 case STATE_DEL: 5034 /* When past the first byte of a multi-byte char don't try 5035 * delete/insert/swap a character. 
*/ 5036 if (has_mbyte && sp->ts_tcharlen > 0) 5037 { 5038 PROF_STORE(sp->ts_state) 5039 sp->ts_state = STATE_FINAL; 5040 break; 5041 } 5042 /* 5043 * Try skipping one character in the bad word (delete it). 5044 */ 5045 PROF_STORE(sp->ts_state) 5046 sp->ts_state = STATE_INS_PREP; 5047 sp->ts_curi = 1; 5048 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') 5049 /* Deleting a vowel at the start of a word counts less, see 5050 * soundalike_score(). */ 5051 newscore = 2 * SCORE_DEL / 3; 5052 else 5053 newscore = SCORE_DEL; 5054 if (fword[sp->ts_fidx] != NUL 5055 && TRY_DEEPER(su, stack, depth, newscore)) 5056 { 5057 go_deeper(stack, depth, newscore); 5058 #ifdef DEBUG_TRIEWALK 5059 sprintf(changename[depth], "%.*s-%s: delete %c", 5060 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5061 fword[sp->ts_fidx]); 5062 #endif 5063 ++depth; 5064 5065 /* Remember what character we deleted, so that we can avoid 5066 * inserting it again. */ 5067 stack[depth].ts_flags |= TSF_DIDDEL; 5068 stack[depth].ts_delidx = sp->ts_fidx; 5069 5070 /* Advance over the character in fword[]. Give a bonus to the 5071 * score if the same character is following "nn" -> "n". It's 5072 * a bit illogical for soundfold tree but it does give better 5073 * results. */ 5074 if (has_mbyte) 5075 { 5076 c = mb_ptr2char(fword + sp->ts_fidx); 5077 stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx); 5078 if (enc_utf8 && utf_iscomposing(c)) 5079 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; 5080 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) 5081 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 5082 } 5083 else 5084 { 5085 ++stack[depth].ts_fidx; 5086 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) 5087 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 5088 } 5089 break; 5090 } 5091 /* FALLTHROUGH */ 5092 5093 case STATE_INS_PREP: 5094 if (sp->ts_flags & TSF_DIDDEL) 5095 { 5096 /* If we just deleted a byte then inserting won't make sense, 5097 * a substitute is always cheaper. 
*/ 5098 PROF_STORE(sp->ts_state) 5099 sp->ts_state = STATE_SWAP; 5100 break; 5101 } 5102 5103 /* skip over NUL bytes */ 5104 n = sp->ts_arridx; 5105 for (;;) 5106 { 5107 if (sp->ts_curi > byts[n]) 5108 { 5109 /* Only NUL bytes at this node, go to next state. */ 5110 PROF_STORE(sp->ts_state) 5111 sp->ts_state = STATE_SWAP; 5112 break; 5113 } 5114 if (byts[n + sp->ts_curi] != NUL) 5115 { 5116 /* Found a byte to insert. */ 5117 PROF_STORE(sp->ts_state) 5118 sp->ts_state = STATE_INS; 5119 break; 5120 } 5121 ++sp->ts_curi; 5122 } 5123 break; 5124 5125 /* FALLTHROUGH */ 5126 5127 case STATE_INS: 5128 /* Insert one byte. Repeat this for each possible byte at this 5129 * node. */ 5130 n = sp->ts_arridx; 5131 if (sp->ts_curi > byts[n]) 5132 { 5133 /* Done all bytes at this node, go to next state. */ 5134 PROF_STORE(sp->ts_state) 5135 sp->ts_state = STATE_SWAP; 5136 break; 5137 } 5138 5139 /* Do one more byte at this node, but: 5140 * - Skip NUL bytes. 5141 * - Skip the byte if it's equal to the byte in the word, 5142 * accepting that byte is always better. 5143 */ 5144 n += sp->ts_curi++; 5145 c = byts[n]; 5146 if (soundfold && sp->ts_twordlen == 0 && c == '*') 5147 /* Inserting a vowel at the start of a word counts less, 5148 * see soundalike_score(). */ 5149 newscore = 2 * SCORE_INS / 3; 5150 else 5151 newscore = SCORE_INS; 5152 if (c != fword[sp->ts_fidx] 5153 && TRY_DEEPER(su, stack, depth, newscore)) 5154 { 5155 go_deeper(stack, depth, newscore); 5156 #ifdef DEBUG_TRIEWALK 5157 sprintf(changename[depth], "%.*s-%s: insert %c", 5158 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5159 c); 5160 #endif 5161 ++depth; 5162 sp = &stack[depth]; 5163 tword[sp->ts_twordlen++] = c; 5164 sp->ts_arridx = idxs[n]; 5165 if (has_mbyte) 5166 { 5167 fl = MB_BYTE2LEN(c); 5168 if (fl > 1) 5169 { 5170 /* There are following bytes for the same character. 5171 * We must find all bytes before trying 5172 * delete/insert/swap/etc. 
*/ 5173 sp->ts_tcharlen = fl; 5174 sp->ts_tcharidx = 1; 5175 sp->ts_isdiff = DIFF_INSERT; 5176 } 5177 } 5178 else 5179 fl = 1; 5180 if (fl == 1) 5181 { 5182 /* If the previous character was the same, thus doubling a 5183 * character, give a bonus to the score. Also for 5184 * soundfold words (illogical but does give a better 5185 * score). */ 5186 if (sp->ts_twordlen >= 2 5187 && tword[sp->ts_twordlen - 2] == c) 5188 sp->ts_score -= SCORE_INS - SCORE_INSDUP; 5189 } 5190 } 5191 break; 5192 5193 case STATE_SWAP: 5194 /* 5195 * Swap two bytes in the bad word: "12" -> "21". 5196 * We change "fword" here, it's changed back afterwards at 5197 * STATE_UNSWAP. 5198 */ 5199 p = fword + sp->ts_fidx; 5200 c = *p; 5201 if (c == NUL) 5202 { 5203 /* End of word, can't swap or replace. */ 5204 PROF_STORE(sp->ts_state) 5205 sp->ts_state = STATE_FINAL; 5206 break; 5207 } 5208 5209 /* Don't swap if the first character is not a word character. 5210 * SWAP3 etc. also don't make sense then. */ 5211 if (!soundfold && !spell_iswordp(p, curwin)) 5212 { 5213 PROF_STORE(sp->ts_state) 5214 sp->ts_state = STATE_REP_INI; 5215 break; 5216 } 5217 5218 if (has_mbyte) 5219 { 5220 n = MB_CPTR2LEN(p); 5221 c = mb_ptr2char(p); 5222 if (p[n] == NUL) 5223 c2 = NUL; 5224 else if (!soundfold && !spell_iswordp(p + n, curwin)) 5225 c2 = c; /* don't swap non-word char */ 5226 else 5227 c2 = mb_ptr2char(p + n); 5228 } 5229 else 5230 { 5231 if (p[1] == NUL) 5232 c2 = NUL; 5233 else if (!soundfold && !spell_iswordp(p + 1, curwin)) 5234 c2 = c; /* don't swap non-word char */ 5235 else 5236 c2 = p[1]; 5237 } 5238 5239 /* When the second character is NUL we can't swap. */ 5240 if (c2 == NUL) 5241 { 5242 PROF_STORE(sp->ts_state) 5243 sp->ts_state = STATE_REP_INI; 5244 break; 5245 } 5246 5247 /* When characters are identical, swap won't do anything. 5248 * Also get here if the second char is not a word character. 
*/ 5249 if (c == c2) 5250 { 5251 PROF_STORE(sp->ts_state) 5252 sp->ts_state = STATE_SWAP3; 5253 break; 5254 } 5255 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP)) 5256 { 5257 go_deeper(stack, depth, SCORE_SWAP); 5258 #ifdef DEBUG_TRIEWALK 5259 sprintf(changename[depth], "%.*s-%s: swap %c and %c", 5260 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5261 c, c2); 5262 #endif 5263 PROF_STORE(sp->ts_state) 5264 sp->ts_state = STATE_UNSWAP; 5265 ++depth; 5266 if (has_mbyte) 5267 { 5268 fl = mb_char2len(c2); 5269 mch_memmove(p, p + n, fl); 5270 mb_char2bytes(c, p + fl); 5271 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 5272 } 5273 else 5274 { 5275 p[0] = c2; 5276 p[1] = c; 5277 stack[depth].ts_fidxtry = sp->ts_fidx + 2; 5278 } 5279 } 5280 else 5281 { 5282 /* If this swap doesn't work then SWAP3 won't either. */ 5283 PROF_STORE(sp->ts_state) 5284 sp->ts_state = STATE_REP_INI; 5285 } 5286 break; 5287 5288 case STATE_UNSWAP: 5289 /* Undo the STATE_SWAP swap: "21" -> "12". */ 5290 p = fword + sp->ts_fidx; 5291 if (has_mbyte) 5292 { 5293 n = MB_PTR2LEN(p); 5294 c = mb_ptr2char(p + n); 5295 mch_memmove(p + MB_PTR2LEN(p + n), p, n); 5296 mb_char2bytes(c, p); 5297 } 5298 else 5299 { 5300 c = *p; 5301 *p = p[1]; 5302 p[1] = c; 5303 } 5304 /* FALLTHROUGH */ 5305 5306 case STATE_SWAP3: 5307 /* Swap two bytes, skipping one: "123" -> "321". We change 5308 * "fword" here, it's changed back afterwards at STATE_UNSWAP3. 
*/ 5309 p = fword + sp->ts_fidx; 5310 if (has_mbyte) 5311 { 5312 n = MB_CPTR2LEN(p); 5313 c = mb_ptr2char(p); 5314 fl = MB_CPTR2LEN(p + n); 5315 c2 = mb_ptr2char(p + n); 5316 if (!soundfold && !spell_iswordp(p + n + fl, curwin)) 5317 c3 = c; /* don't swap non-word char */ 5318 else 5319 c3 = mb_ptr2char(p + n + fl); 5320 } 5321 else 5322 { 5323 c = *p; 5324 c2 = p[1]; 5325 if (!soundfold && !spell_iswordp(p + 2, curwin)) 5326 c3 = c; /* don't swap non-word char */ 5327 else 5328 c3 = p[2]; 5329 } 5330 5331 /* When characters are identical: "121" then SWAP3 result is 5332 * identical, ROT3L result is same as SWAP: "211", ROT3L result is 5333 * same as SWAP on next char: "112". Thus skip all swapping. 5334 * Also skip when c3 is NUL. 5335 * Also get here when the third character is not a word character. 5336 * Second character may any char: "a.b" -> "b.a" */ 5337 if (c == c3 || c3 == NUL) 5338 { 5339 PROF_STORE(sp->ts_state) 5340 sp->ts_state = STATE_REP_INI; 5341 break; 5342 } 5343 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5344 { 5345 go_deeper(stack, depth, SCORE_SWAP3); 5346 #ifdef DEBUG_TRIEWALK 5347 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c", 5348 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5349 c, c3); 5350 #endif 5351 PROF_STORE(sp->ts_state) 5352 sp->ts_state = STATE_UNSWAP3; 5353 ++depth; 5354 if (has_mbyte) 5355 { 5356 tl = mb_char2len(c3); 5357 mch_memmove(p, p + n + fl, tl); 5358 mb_char2bytes(c2, p + tl); 5359 mb_char2bytes(c, p + fl + tl); 5360 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; 5361 } 5362 else 5363 { 5364 p[0] = p[2]; 5365 p[2] = c; 5366 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5367 } 5368 } 5369 else 5370 { 5371 PROF_STORE(sp->ts_state) 5372 sp->ts_state = STATE_REP_INI; 5373 } 5374 break; 5375 5376 case STATE_UNSWAP3: 5377 /* Undo STATE_SWAP3: "321" -> "123" */ 5378 p = fword + sp->ts_fidx; 5379 if (has_mbyte) 5380 { 5381 n = MB_PTR2LEN(p); 5382 c2 = mb_ptr2char(p + n); 5383 fl = MB_PTR2LEN(p + n); 5384 c = 
mb_ptr2char(p + n + fl); 5385 tl = MB_PTR2LEN(p + n + fl); 5386 mch_memmove(p + fl + tl, p, n); 5387 mb_char2bytes(c, p); 5388 mb_char2bytes(c2, p + tl); 5389 p = p + tl; 5390 } 5391 else 5392 { 5393 c = *p; 5394 *p = p[2]; 5395 p[2] = c; 5396 ++p; 5397 } 5398 5399 if (!soundfold && !spell_iswordp(p, curwin)) 5400 { 5401 /* Middle char is not a word char, skip the rotate. First and 5402 * third char were already checked at swap and swap3. */ 5403 PROF_STORE(sp->ts_state) 5404 sp->ts_state = STATE_REP_INI; 5405 break; 5406 } 5407 5408 /* Rotate three characters left: "123" -> "231". We change 5409 * "fword" here, it's changed back afterwards at STATE_UNROT3L. */ 5410 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5411 { 5412 go_deeper(stack, depth, SCORE_SWAP3); 5413 #ifdef DEBUG_TRIEWALK 5414 p = fword + sp->ts_fidx; 5415 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c", 5416 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5417 p[0], p[1], p[2]); 5418 #endif 5419 PROF_STORE(sp->ts_state) 5420 sp->ts_state = STATE_UNROT3L; 5421 ++depth; 5422 p = fword + sp->ts_fidx; 5423 if (has_mbyte) 5424 { 5425 n = MB_CPTR2LEN(p); 5426 c = mb_ptr2char(p); 5427 fl = MB_CPTR2LEN(p + n); 5428 fl += MB_CPTR2LEN(p + n + fl); 5429 mch_memmove(p, p + n, fl); 5430 mb_char2bytes(c, p + fl); 5431 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 5432 } 5433 else 5434 { 5435 c = *p; 5436 *p = p[1]; 5437 p[1] = p[2]; 5438 p[2] = c; 5439 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5440 } 5441 } 5442 else 5443 { 5444 PROF_STORE(sp->ts_state) 5445 sp->ts_state = STATE_REP_INI; 5446 } 5447 break; 5448 5449 case STATE_UNROT3L: 5450 /* Undo ROT3L: "231" -> "123" */ 5451 p = fword + sp->ts_fidx; 5452 if (has_mbyte) 5453 { 5454 n = MB_PTR2LEN(p); 5455 n += MB_PTR2LEN(p + n); 5456 c = mb_ptr2char(p + n); 5457 tl = MB_PTR2LEN(p + n); 5458 mch_memmove(p + tl, p, n); 5459 mb_char2bytes(c, p); 5460 } 5461 else 5462 { 5463 c = p[2]; 5464 p[2] = p[1]; 5465 p[1] = *p; 5466 *p = c; 5467 } 5468 5469 /* 
Rotate three bytes right: "123" -> "312". We change "fword" 5470 * here, it's changed back afterwards at STATE_UNROT3R. */ 5471 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) 5472 { 5473 go_deeper(stack, depth, SCORE_SWAP3); 5474 #ifdef DEBUG_TRIEWALK 5475 p = fword + sp->ts_fidx; 5476 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c", 5477 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5478 p[0], p[1], p[2]); 5479 #endif 5480 PROF_STORE(sp->ts_state) 5481 sp->ts_state = STATE_UNROT3R; 5482 ++depth; 5483 p = fword + sp->ts_fidx; 5484 if (has_mbyte) 5485 { 5486 n = MB_CPTR2LEN(p); 5487 n += MB_CPTR2LEN(p + n); 5488 c = mb_ptr2char(p + n); 5489 tl = MB_CPTR2LEN(p + n); 5490 mch_memmove(p + tl, p, n); 5491 mb_char2bytes(c, p); 5492 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; 5493 } 5494 else 5495 { 5496 c = p[2]; 5497 p[2] = p[1]; 5498 p[1] = *p; 5499 *p = c; 5500 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 5501 } 5502 } 5503 else 5504 { 5505 PROF_STORE(sp->ts_state) 5506 sp->ts_state = STATE_REP_INI; 5507 } 5508 break; 5509 5510 case STATE_UNROT3R: 5511 /* Undo ROT3R: "312" -> "123" */ 5512 p = fword + sp->ts_fidx; 5513 if (has_mbyte) 5514 { 5515 c = mb_ptr2char(p); 5516 tl = MB_PTR2LEN(p); 5517 n = MB_PTR2LEN(p + tl); 5518 n += MB_PTR2LEN(p + tl + n); 5519 mch_memmove(p, p + tl, n); 5520 mb_char2bytes(c, p + n); 5521 } 5522 else 5523 { 5524 c = *p; 5525 *p = p[1]; 5526 p[1] = p[2]; 5527 p[2] = c; 5528 } 5529 /* FALLTHROUGH */ 5530 5531 case STATE_REP_INI: 5532 /* Check if matching with REP items from the .aff file would work. 
5533 * Quickly skip if: 5534 * - there are no REP items and we are not in the soundfold trie 5535 * - the score is going to be too high anyway 5536 * - already applied a REP item or swapped here */ 5537 if ((lp->lp_replang == NULL && !soundfold) 5538 || sp->ts_score + SCORE_REP >= su->su_maxscore 5539 || sp->ts_fidx < sp->ts_fidxtry) 5540 { 5541 PROF_STORE(sp->ts_state) 5542 sp->ts_state = STATE_FINAL; 5543 break; 5544 } 5545 5546 /* Use the first byte to quickly find the first entry that may 5547 * match. If the index is -1 there is none. */ 5548 if (soundfold) 5549 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]]; 5550 else 5551 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; 5552 5553 if (sp->ts_curi < 0) 5554 { 5555 PROF_STORE(sp->ts_state) 5556 sp->ts_state = STATE_FINAL; 5557 break; 5558 } 5559 5560 PROF_STORE(sp->ts_state) 5561 sp->ts_state = STATE_REP; 5562 /* FALLTHROUGH */ 5563 5564 case STATE_REP: 5565 /* Try matching with REP items from the .aff file. For each match 5566 * replace the characters and check if the resulting word is 5567 * valid. */ 5568 p = fword + sp->ts_fidx; 5569 5570 if (soundfold) 5571 gap = &slang->sl_repsal; 5572 else 5573 gap = &lp->lp_replang->sl_rep; 5574 while (sp->ts_curi < gap->ga_len) 5575 { 5576 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; 5577 if (*ftp->ft_from != *p) 5578 { 5579 /* past possible matching entries */ 5580 sp->ts_curi = gap->ga_len; 5581 break; 5582 } 5583 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 5584 && TRY_DEEPER(su, stack, depth, SCORE_REP)) 5585 { 5586 go_deeper(stack, depth, SCORE_REP); 5587 #ifdef DEBUG_TRIEWALK 5588 sprintf(changename[depth], "%.*s-%s: replace %s with %s", 5589 sp->ts_twordlen, tword, fword + sp->ts_fidx, 5590 ftp->ft_from, ftp->ft_to); 5591 #endif 5592 /* Need to undo this afterwards. */ 5593 PROF_STORE(sp->ts_state) 5594 sp->ts_state = STATE_REP_UNDO; 5595 5596 /* Change the "from" to the "to" string. 
*/ 5597 ++depth; 5598 fl = (int)STRLEN(ftp->ft_from); 5599 tl = (int)STRLEN(ftp->ft_to); 5600 if (fl != tl) 5601 { 5602 STRMOVE(p + tl, p + fl); 5603 repextra += tl - fl; 5604 } 5605 mch_memmove(p, ftp->ft_to, tl); 5606 stack[depth].ts_fidxtry = sp->ts_fidx + tl; 5607 stack[depth].ts_tcharlen = 0; 5608 break; 5609 } 5610 } 5611 5612 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) 5613 { 5614 /* No (more) matches. */ 5615 PROF_STORE(sp->ts_state) 5616 sp->ts_state = STATE_FINAL; 5617 } 5618 5619 break; 5620 5621 case STATE_REP_UNDO: 5622 /* Undo a REP replacement and continue with the next one. */ 5623 if (soundfold) 5624 gap = &slang->sl_repsal; 5625 else 5626 gap = &lp->lp_replang->sl_rep; 5627 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; 5628 fl = (int)STRLEN(ftp->ft_from); 5629 tl = (int)STRLEN(ftp->ft_to); 5630 p = fword + sp->ts_fidx; 5631 if (fl != tl) 5632 { 5633 STRMOVE(p + fl, p + tl); 5634 repextra -= tl - fl; 5635 } 5636 mch_memmove(p, ftp->ft_from, fl); 5637 PROF_STORE(sp->ts_state) 5638 sp->ts_state = STATE_REP; 5639 break; 5640 5641 default: 5642 /* Did all possible states at this level, go up one level. */ 5643 --depth; 5644 5645 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) 5646 { 5647 /* Continue in or go back to the prefix tree. */ 5648 byts = pbyts; 5649 idxs = pidxs; 5650 } 5651 5652 /* Don't check for CTRL-C too often, it takes time. */ 5653 if (--breakcheckcount == 0) 5654 { 5655 ui_breakcheck(); 5656 breakcheckcount = 1000; 5657 } 5658 } 5659 } 5660 } 5661 5662 5663 /* 5664 * Go one level deeper in the tree. 
5665 */ 5666 static void 5667 go_deeper(trystate_T *stack, int depth, int score_add) 5668 { 5669 stack[depth + 1] = stack[depth]; 5670 stack[depth + 1].ts_state = STATE_START; 5671 stack[depth + 1].ts_score = stack[depth].ts_score + score_add; 5672 stack[depth + 1].ts_curi = 1; /* start just after length byte */ 5673 stack[depth + 1].ts_flags = 0; 5674 } 5675 5676 /* 5677 * Case-folding may change the number of bytes: Count nr of chars in 5678 * fword[flen] and return the byte length of that many chars in "word". 5679 */ 5680 static int 5681 nofold_len(char_u *fword, int flen, char_u *word) 5682 { 5683 char_u *p; 5684 int i = 0; 5685 5686 for (p = fword; p < fword + flen; MB_PTR_ADV(p)) 5687 ++i; 5688 for (p = word; i > 0; MB_PTR_ADV(p)) 5689 --i; 5690 return (int)(p - word); 5691 } 5692 5693 /* 5694 * "fword" is a good word with case folded. Find the matching keep-case 5695 * words and put it in "kword". 5696 * Theoretically there could be several keep-case words that result in the 5697 * same case-folded word, but we only find one... 5698 */ 5699 static void 5700 find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword) 5701 { 5702 char_u uword[MAXWLEN]; /* "fword" in upper-case */ 5703 int depth; 5704 idx_T tryidx; 5705 5706 /* The following arrays are used at each depth in the tree. */ 5707 idx_T arridx[MAXWLEN]; 5708 int round[MAXWLEN]; 5709 int fwordidx[MAXWLEN]; 5710 int uwordidx[MAXWLEN]; 5711 int kwordlen[MAXWLEN]; 5712 5713 int flen, ulen; 5714 int l; 5715 int len; 5716 int c; 5717 idx_T lo, hi, m; 5718 char_u *p; 5719 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */ 5720 idx_T *idxs = slang->sl_kidxs; /* array with indexes */ 5721 5722 if (byts == NULL) 5723 { 5724 /* array is empty: "cannot happen" */ 5725 *kword = NUL; 5726 return; 5727 } 5728 5729 /* Make an all-cap version of "fword". */ 5730 allcap_copy(fword, uword); 5731 5732 /* 5733 * Each character needs to be tried both case-folded and upper-case. 
5734 * All this gets very complicated if we keep in mind that changing case 5735 * may change the byte length of a multi-byte character... 5736 */ 5737 depth = 0; 5738 arridx[0] = 0; 5739 round[0] = 0; 5740 fwordidx[0] = 0; 5741 uwordidx[0] = 0; 5742 kwordlen[0] = 0; 5743 while (depth >= 0) 5744 { 5745 if (fword[fwordidx[depth]] == NUL) 5746 { 5747 /* We are at the end of "fword". If the tree allows a word to end 5748 * here we have found a match. */ 5749 if (byts[arridx[depth] + 1] == 0) 5750 { 5751 kword[kwordlen[depth]] = NUL; 5752 return; 5753 } 5754 5755 /* kword is getting too long, continue one level up */ 5756 --depth; 5757 } 5758 else if (++round[depth] > 2) 5759 { 5760 /* tried both fold-case and upper-case character, continue one 5761 * level up */ 5762 --depth; 5763 } 5764 else 5765 { 5766 /* 5767 * round[depth] == 1: Try using the folded-case character. 5768 * round[depth] == 2: Try using the upper-case character. 5769 */ 5770 if (has_mbyte) 5771 { 5772 flen = MB_CPTR2LEN(fword + fwordidx[depth]); 5773 ulen = MB_CPTR2LEN(uword + uwordidx[depth]); 5774 } 5775 else 5776 ulen = flen = 1; 5777 if (round[depth] == 1) 5778 { 5779 p = fword + fwordidx[depth]; 5780 l = flen; 5781 } 5782 else 5783 { 5784 p = uword + uwordidx[depth]; 5785 l = ulen; 5786 } 5787 5788 for (tryidx = arridx[depth]; l > 0; --l) 5789 { 5790 /* Perform a binary search in the list of accepted bytes. */ 5791 len = byts[tryidx++]; 5792 c = *p++; 5793 lo = tryidx; 5794 hi = tryidx + len - 1; 5795 while (lo < hi) 5796 { 5797 m = (lo + hi) / 2; 5798 if (byts[m] > c) 5799 hi = m - 1; 5800 else if (byts[m] < c) 5801 lo = m + 1; 5802 else 5803 { 5804 lo = hi = m; 5805 break; 5806 } 5807 } 5808 5809 /* Stop if there is no matching byte. */ 5810 if (hi < lo || byts[lo] != c) 5811 break; 5812 5813 /* Continue at the child (if there is one). */ 5814 tryidx = idxs[lo]; 5815 } 5816 5817 if (l == 0) 5818 { 5819 /* 5820 * Found the matching char. Copy it to "kword" and go a 5821 * level deeper. 
5822 */ 5823 if (round[depth] == 1) 5824 { 5825 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth], 5826 flen); 5827 kwordlen[depth + 1] = kwordlen[depth] + flen; 5828 } 5829 else 5830 { 5831 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth], 5832 ulen); 5833 kwordlen[depth + 1] = kwordlen[depth] + ulen; 5834 } 5835 fwordidx[depth + 1] = fwordidx[depth] + flen; 5836 uwordidx[depth + 1] = uwordidx[depth] + ulen; 5837 5838 ++depth; 5839 arridx[depth] = tryidx; 5840 round[depth] = 0; 5841 } 5842 } 5843 } 5844 5845 /* Didn't find it: "cannot happen". */ 5846 *kword = NUL; 5847 } 5848 5849 /* 5850 * Compute the sound-a-like score for suggestions in su->su_ga and add them to 5851 * su->su_sga. 5852 */ 5853 static void 5854 score_comp_sal(suginfo_T *su) 5855 { 5856 langp_T *lp; 5857 char_u badsound[MAXWLEN]; 5858 int i; 5859 suggest_T *stp; 5860 suggest_T *sstp; 5861 int score; 5862 int lpi; 5863 5864 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL) 5865 return; 5866 5867 /* Use the sound-folding of the first language that supports it. */ 5868 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 5869 { 5870 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 5871 if (lp->lp_slang->sl_sal.ga_len > 0) 5872 { 5873 /* soundfold the bad word */ 5874 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound); 5875 5876 for (i = 0; i < su->su_ga.ga_len; ++i) 5877 { 5878 stp = &SUG(su->su_ga, i); 5879 5880 /* Case-fold the suggested word, sound-fold it and compute the 5881 * sound-a-like score. */ 5882 score = stp_sal_score(stp, su, lp->lp_slang, badsound); 5883 if (score < SCORE_MAXMAX) 5884 { 5885 /* Add the suggestion. 
*/ 5886 sstp = &SUG(su->su_sga, su->su_sga.ga_len); 5887 sstp->st_word = vim_strsave(stp->st_word); 5888 if (sstp->st_word != NULL) 5889 { 5890 sstp->st_wordlen = stp->st_wordlen; 5891 sstp->st_score = score; 5892 sstp->st_altscore = 0; 5893 sstp->st_orglen = stp->st_orglen; 5894 ++su->su_sga.ga_len; 5895 } 5896 } 5897 } 5898 break; 5899 } 5900 } 5901 } 5902 5903 /* 5904 * Combine the list of suggestions in su->su_ga and su->su_sga. 5905 * They are entwined. 5906 */ 5907 static void 5908 score_combine(suginfo_T *su) 5909 { 5910 int i; 5911 int j; 5912 garray_T ga; 5913 garray_T *gap; 5914 langp_T *lp; 5915 suggest_T *stp; 5916 char_u *p; 5917 char_u badsound[MAXWLEN]; 5918 int round; 5919 int lpi; 5920 slang_T *slang = NULL; 5921 5922 /* Add the alternate score to su_ga. */ 5923 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 5924 { 5925 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 5926 if (lp->lp_slang->sl_sal.ga_len > 0) 5927 { 5928 /* soundfold the bad word */ 5929 slang = lp->lp_slang; 5930 spell_soundfold(slang, su->su_fbadword, TRUE, badsound); 5931 5932 for (i = 0; i < su->su_ga.ga_len; ++i) 5933 { 5934 stp = &SUG(su->su_ga, i); 5935 stp->st_altscore = stp_sal_score(stp, su, slang, badsound); 5936 if (stp->st_altscore == SCORE_MAXMAX) 5937 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; 5938 else 5939 stp->st_score = (stp->st_score * 3 5940 + stp->st_altscore) / 4; 5941 stp->st_salscore = FALSE; 5942 } 5943 break; 5944 } 5945 } 5946 5947 if (slang == NULL) /* Using "double" without sound folding. */ 5948 { 5949 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, 5950 su->su_maxcount); 5951 return; 5952 } 5953 5954 /* Add the alternate score to su_sga. 
*/ 5955 for (i = 0; i < su->su_sga.ga_len; ++i) 5956 { 5957 stp = &SUG(su->su_sga, i); 5958 stp->st_altscore = spell_edit_score(slang, 5959 su->su_badword, stp->st_word); 5960 if (stp->st_score == SCORE_MAXMAX) 5961 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; 5962 else 5963 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; 5964 stp->st_salscore = TRUE; 5965 } 5966 5967 /* Remove bad suggestions, sort the suggestions and truncate at "maxcount" 5968 * for both lists. */ 5969 check_suggestions(su, &su->su_ga); 5970 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 5971 check_suggestions(su, &su->su_sga); 5972 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); 5973 5974 ga_init2(&ga, (int)sizeof(suginfo_T), 1); 5975 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) 5976 return; 5977 5978 stp = &SUG(ga, 0); 5979 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) 5980 { 5981 /* round 1: get a suggestion from su_ga 5982 * round 2: get a suggestion from su_sga */ 5983 for (round = 1; round <= 2; ++round) 5984 { 5985 gap = round == 1 ? &su->su_ga : &su->su_sga; 5986 if (i < gap->ga_len) 5987 { 5988 /* Don't add a word if it's already there. */ 5989 p = SUG(*gap, i).st_word; 5990 for (j = 0; j < ga.ga_len; ++j) 5991 if (STRCMP(stp[j].st_word, p) == 0) 5992 break; 5993 if (j == ga.ga_len) 5994 stp[ga.ga_len++] = SUG(*gap, i); 5995 else 5996 vim_free(p); 5997 } 5998 } 5999 } 6000 6001 ga_clear(&su->su_ga); 6002 ga_clear(&su->su_sga); 6003 6004 /* Truncate the list to the number of suggestions that will be displayed. */ 6005 if (ga.ga_len > su->su_maxcount) 6006 { 6007 for (i = su->su_maxcount; i < ga.ga_len; ++i) 6008 vim_free(stp[i].st_word); 6009 ga.ga_len = su->su_maxcount; 6010 } 6011 6012 su->su_ga = ga; 6013 } 6014 6015 /* 6016 * For the goodword in "stp" compute the soundalike score compared to the 6017 * badword. 
6018 */ 6019 static int 6020 stp_sal_score( 6021 suggest_T *stp, 6022 suginfo_T *su, 6023 slang_T *slang, 6024 char_u *badsound) /* sound-folded badword */ 6025 { 6026 char_u *p; 6027 char_u *pbad; 6028 char_u *pgood; 6029 char_u badsound2[MAXWLEN]; 6030 char_u fword[MAXWLEN]; 6031 char_u goodsound[MAXWLEN]; 6032 char_u goodword[MAXWLEN]; 6033 int lendiff; 6034 6035 lendiff = (int)(su->su_badlen - stp->st_orglen); 6036 if (lendiff >= 0) 6037 pbad = badsound; 6038 else 6039 { 6040 /* soundfold the bad word with more characters following */ 6041 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN); 6042 6043 /* When joining two words the sound often changes a lot. E.g., "t he" 6044 * sounds like "t h" while "the" sounds like "@". Avoid that by 6045 * removing the space. Don't do it when the good word also contains a 6046 * space. */ 6047 if (VIM_ISWHITE(su->su_badptr[su->su_badlen]) 6048 && *skiptowhite(stp->st_word) == NUL) 6049 for (p = fword; *(p = skiptowhite(p)) != NUL; ) 6050 STRMOVE(p, p + 1); 6051 6052 spell_soundfold(slang, fword, TRUE, badsound2); 6053 pbad = badsound2; 6054 } 6055 6056 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) 6057 { 6058 /* Add part of the bad word to the good word, so that we soundfold 6059 * what replaces the bad word. */ 6060 STRCPY(goodword, stp->st_word); 6061 vim_strncpy(goodword + stp->st_wordlen, 6062 su->su_badptr + su->su_badlen - lendiff, lendiff); 6063 pgood = goodword; 6064 } 6065 else 6066 pgood = stp->st_word; 6067 6068 /* Sound-fold the word and compute the score for the difference. */ 6069 spell_soundfold(slang, pgood, FALSE, goodsound); 6070 6071 return soundalike_score(goodsound, pbad); 6072 } 6073 6074 /* structure used to store soundfolded words that add_sound_suggest() has 6075 * handled already. 
*/ 6076 typedef struct 6077 { 6078 short sft_score; /* lowest score used */ 6079 char_u sft_word[1]; /* soundfolded word, actually longer */ 6080 } sftword_T; 6081 6082 static sftword_T dumsft; 6083 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft))) 6084 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) 6085 6086 /* 6087 * Prepare for calling suggest_try_soundalike(). 6088 */ 6089 static void 6090 suggest_try_soundalike_prep(void) 6091 { 6092 langp_T *lp; 6093 int lpi; 6094 slang_T *slang; 6095 6096 /* Do this for all languages that support sound folding and for which a 6097 * .sug file has been loaded. */ 6098 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6099 { 6100 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6101 slang = lp->lp_slang; 6102 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6103 /* prepare the hashtable used by add_sound_suggest() */ 6104 hash_init(&slang->sl_sounddone); 6105 } 6106 } 6107 6108 /* 6109 * Find suggestions by comparing the word in a sound-a-like form. 6110 * Note: This doesn't support postponed prefixes. 6111 */ 6112 static void 6113 suggest_try_soundalike(suginfo_T *su) 6114 { 6115 char_u salword[MAXWLEN]; 6116 langp_T *lp; 6117 int lpi; 6118 slang_T *slang; 6119 6120 /* Do this for all languages that support sound folding and for which a 6121 * .sug file has been loaded. */ 6122 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6123 { 6124 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6125 slang = lp->lp_slang; 6126 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6127 { 6128 /* soundfold the bad word */ 6129 spell_soundfold(slang, su->su_fbadword, TRUE, salword); 6130 6131 /* try all kinds of inserts/deletes/swaps/etc. 
*/ 6132 /* TODO: also soundfold the next words, so that we can try joining 6133 * and splitting */ 6134 #ifdef SUGGEST_PROFILE 6135 prof_init(); 6136 #endif 6137 suggest_trie_walk(su, lp, salword, TRUE); 6138 #ifdef SUGGEST_PROFILE 6139 prof_report("soundalike"); 6140 #endif 6141 } 6142 } 6143 } 6144 6145 /* 6146 * Finish up after calling suggest_try_soundalike(). 6147 */ 6148 static void 6149 suggest_try_soundalike_finish(void) 6150 { 6151 langp_T *lp; 6152 int lpi; 6153 slang_T *slang; 6154 int todo; 6155 hashitem_T *hi; 6156 6157 /* Do this for all languages that support sound folding and for which a 6158 * .sug file has been loaded. */ 6159 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6160 { 6161 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6162 slang = lp->lp_slang; 6163 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) 6164 { 6165 /* Free the info about handled words. */ 6166 todo = (int)slang->sl_sounddone.ht_used; 6167 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi) 6168 if (!HASHITEM_EMPTY(hi)) 6169 { 6170 vim_free(HI2SFT(hi)); 6171 --todo; 6172 } 6173 6174 /* Clear the hashtable, it may also be used by another region. */ 6175 hash_clear(&slang->sl_sounddone); 6176 hash_init(&slang->sl_sounddone); 6177 } 6178 } 6179 } 6180 6181 /* 6182 * A match with a soundfolded word is found. Add the good word(s) that 6183 * produce this soundfolded word. 
6184 */ 6185 static void 6186 add_sound_suggest( 6187 suginfo_T *su, 6188 char_u *goodword, 6189 int score, /* soundfold score */ 6190 langp_T *lp) 6191 { 6192 slang_T *slang = lp->lp_slang; /* language for sound folding */ 6193 int sfwordnr; 6194 char_u *nrline; 6195 int orgnr; 6196 char_u theword[MAXWLEN]; 6197 int i; 6198 int wlen; 6199 char_u *byts; 6200 idx_T *idxs; 6201 int n; 6202 int wordcount; 6203 int wc; 6204 int goodscore; 6205 hash_T hash; 6206 hashitem_T *hi; 6207 sftword_T *sft; 6208 int bc, gc; 6209 int limit; 6210 6211 /* 6212 * It's very well possible that the same soundfold word is found several 6213 * times with different scores. Since the following is quite slow only do 6214 * the words that have a better score than before. Use a hashtable to 6215 * remember the words that have been done. 6216 */ 6217 hash = hash_hash(goodword); 6218 hi = hash_lookup(&slang->sl_sounddone, goodword, hash); 6219 if (HASHITEM_EMPTY(hi)) 6220 { 6221 sft = (sftword_T *)alloc((unsigned)(sizeof(sftword_T) 6222 + STRLEN(goodword))); 6223 if (sft != NULL) 6224 { 6225 sft->sft_score = score; 6226 STRCPY(sft->sft_word, goodword); 6227 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash); 6228 } 6229 } 6230 else 6231 { 6232 sft = HI2SFT(hi); 6233 if (score >= sft->sft_score) 6234 return; 6235 sft->sft_score = score; 6236 } 6237 6238 /* 6239 * Find the word nr in the soundfold tree. 6240 */ 6241 sfwordnr = soundfold_find(slang, goodword); 6242 if (sfwordnr < 0) 6243 { 6244 internal_error("add_sound_suggest()"); 6245 return; 6246 } 6247 6248 /* 6249 * go over the list of good words that produce this soundfold word 6250 */ 6251 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE); 6252 orgnr = 0; 6253 while (*nrline != NUL) 6254 { 6255 /* The wordnr was stored in a minimal nr of bytes as an offset to the 6256 * previous wordnr. 
*/ 6257 orgnr += bytes2offset(&nrline); 6258 6259 byts = slang->sl_fbyts; 6260 idxs = slang->sl_fidxs; 6261 6262 /* Lookup the word "orgnr" one of the two tries. */ 6263 n = 0; 6264 wordcount = 0; 6265 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen) 6266 { 6267 i = 1; 6268 if (wordcount == orgnr && byts[n + 1] == NUL) 6269 break; /* found end of word */ 6270 6271 if (byts[n + 1] == NUL) 6272 ++wordcount; 6273 6274 /* skip over the NUL bytes */ 6275 for ( ; byts[n + i] == NUL; ++i) 6276 if (i > byts[n]) /* safety check */ 6277 { 6278 STRCPY(theword + wlen, "BAD"); 6279 wlen += 3; 6280 goto badword; 6281 } 6282 6283 /* One of the siblings must have the word. */ 6284 for ( ; i < byts[n]; ++i) 6285 { 6286 wc = idxs[idxs[n + i]]; /* nr of words under this byte */ 6287 if (wordcount + wc > orgnr) 6288 break; 6289 wordcount += wc; 6290 } 6291 6292 theword[wlen] = byts[n + i]; 6293 n = idxs[n + i]; 6294 } 6295 badword: 6296 theword[wlen] = NUL; 6297 6298 /* Go over the possible flags and regions. */ 6299 for (; i <= byts[n] && byts[n + i] == NUL; ++i) 6300 { 6301 char_u cword[MAXWLEN]; 6302 char_u *p; 6303 int flags = (int)idxs[n + i]; 6304 6305 /* Skip words with the NOSUGGEST flag */ 6306 if (flags & WF_NOSUGGEST) 6307 continue; 6308 6309 if (flags & WF_KEEPCAP) 6310 { 6311 /* Must find the word in the keep-case tree. */ 6312 find_keepcap_word(slang, theword, cword); 6313 p = cword; 6314 } 6315 else 6316 { 6317 flags |= su->su_badflags; 6318 if ((flags & WF_CAPMASK) != 0) 6319 { 6320 /* Need to fix case according to "flags". */ 6321 make_case_word(theword, cword, flags); 6322 p = cword; 6323 } 6324 else 6325 p = theword; 6326 } 6327 6328 /* Add the suggestion. */ 6329 if (sps_flags & SPS_DOUBLE) 6330 { 6331 /* Add the suggestion if the score isn't too bad. */ 6332 if (score <= su->su_maxscore) 6333 add_suggestion(su, &su->su_sga, p, su->su_badlen, 6334 score, 0, FALSE, slang, FALSE); 6335 } 6336 else 6337 { 6338 /* Add a penalty for words in another region. 
*/ 6339 if ((flags & WF_REGION) 6340 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 6341 goodscore = SCORE_REGION; 6342 else 6343 goodscore = 0; 6344 6345 /* Add a small penalty for changing the first letter from 6346 * lower to upper case. Helps for "tath" -> "Kath", which is 6347 * less common than "tath" -> "path". Don't do it when the 6348 * letter is the same, that has already been counted. */ 6349 gc = PTR2CHAR(p); 6350 if (SPELL_ISUPPER(gc)) 6351 { 6352 bc = PTR2CHAR(su->su_badword); 6353 if (!SPELL_ISUPPER(bc) 6354 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) 6355 goodscore += SCORE_ICASE / 2; 6356 } 6357 6358 /* Compute the score for the good word. This only does letter 6359 * insert/delete/swap/replace. REP items are not considered, 6360 * which may make the score a bit higher. 6361 * Use a limit for the score to make it work faster. Use 6362 * MAXSCORE(), because RESCORE() will change the score. 6363 * If the limit is very high then the iterative method is 6364 * inefficient, using an array is quicker. */ 6365 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); 6366 if (limit > SCORE_LIMITMAX) 6367 goodscore += spell_edit_score(slang, su->su_badword, p); 6368 else 6369 goodscore += spell_edit_score_limit(slang, su->su_badword, 6370 p, limit); 6371 6372 /* When going over the limit don't bother to do the rest. */ 6373 if (goodscore < SCORE_MAXMAX) 6374 { 6375 /* Give a bonus to words seen before. */ 6376 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE); 6377 6378 /* Add the suggestion if the score isn't too bad. */ 6379 goodscore = RESCORE(goodscore, score); 6380 if (goodscore <= su->su_sfmaxscore) 6381 add_suggestion(su, &su->su_ga, p, su->su_badlen, 6382 goodscore, score, TRUE, slang, TRUE); 6383 } 6384 } 6385 } 6386 /* smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr); */ 6387 } 6388 } 6389 6390 /* 6391 * Find word "word" in fold-case tree for "slang" and return the word number. 
6392 */ 6393 static int 6394 soundfold_find(slang_T *slang, char_u *word) 6395 { 6396 idx_T arridx = 0; 6397 int len; 6398 int wlen = 0; 6399 int c; 6400 char_u *ptr = word; 6401 char_u *byts; 6402 idx_T *idxs; 6403 int wordnr = 0; 6404 6405 byts = slang->sl_sbyts; 6406 idxs = slang->sl_sidxs; 6407 6408 for (;;) 6409 { 6410 /* First byte is the number of possible bytes. */ 6411 len = byts[arridx++]; 6412 6413 /* If the first possible byte is a zero the word could end here. 6414 * If the word ends we found the word. If not skip the NUL bytes. */ 6415 c = ptr[wlen]; 6416 if (byts[arridx] == NUL) 6417 { 6418 if (c == NUL) 6419 break; 6420 6421 /* Skip over the zeros, there can be several. */ 6422 while (len > 0 && byts[arridx] == NUL) 6423 { 6424 ++arridx; 6425 --len; 6426 } 6427 if (len == 0) 6428 return -1; /* no children, word should have ended here */ 6429 ++wordnr; 6430 } 6431 6432 /* If the word ends we didn't find it. */ 6433 if (c == NUL) 6434 return -1; 6435 6436 /* Perform a binary search in the list of accepted bytes. */ 6437 if (c == TAB) /* <Tab> is handled like <Space> */ 6438 c = ' '; 6439 while (byts[arridx] < c) 6440 { 6441 /* The word count is in the first idxs[] entry of the child. */ 6442 wordnr += idxs[idxs[arridx]]; 6443 ++arridx; 6444 if (--len == 0) /* end of the bytes, didn't find it */ 6445 return -1; 6446 } 6447 if (byts[arridx] != c) /* didn't find the byte */ 6448 return -1; 6449 6450 /* Continue at the child (if there is one). */ 6451 arridx = idxs[arridx]; 6452 ++wlen; 6453 6454 /* One space in the good word may stand for several spaces in the 6455 * checked word. */ 6456 if (c == ' ') 6457 while (ptr[wlen] == ' ' || ptr[wlen] == TAB) 6458 ++wlen; 6459 } 6460 6461 return wordnr; 6462 } 6463 6464 /* 6465 * Copy "fword" to "cword", fixing case according to "flags". 
6466 */ 6467 static void 6468 make_case_word(char_u *fword, char_u *cword, int flags) 6469 { 6470 if (flags & WF_ALLCAP) 6471 /* Make it all upper-case */ 6472 allcap_copy(fword, cword); 6473 else if (flags & WF_ONECAP) 6474 /* Make the first letter upper-case */ 6475 onecap_copy(fword, cword, TRUE); 6476 else 6477 /* Use goodword as-is. */ 6478 STRCPY(cword, fword); 6479 } 6480 6481 6482 /* 6483 * Return TRUE if "c1" and "c2" are similar characters according to the MAP 6484 * lines in the .aff file. 6485 */ 6486 static int 6487 similar_chars(slang_T *slang, int c1, int c2) 6488 { 6489 int m1, m2; 6490 char_u buf[MB_MAXBYTES + 1]; 6491 hashitem_T *hi; 6492 6493 if (c1 >= 256) 6494 { 6495 buf[mb_char2bytes(c1, buf)] = 0; 6496 hi = hash_find(&slang->sl_map_hash, buf); 6497 if (HASHITEM_EMPTY(hi)) 6498 m1 = 0; 6499 else 6500 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); 6501 } 6502 else 6503 m1 = slang->sl_map_array[c1]; 6504 if (m1 == 0) 6505 return FALSE; 6506 6507 6508 if (c2 >= 256) 6509 { 6510 buf[mb_char2bytes(c2, buf)] = 0; 6511 hi = hash_find(&slang->sl_map_hash, buf); 6512 if (HASHITEM_EMPTY(hi)) 6513 m2 = 0; 6514 else 6515 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); 6516 } 6517 else 6518 m2 = slang->sl_map_array[c2]; 6519 6520 return m1 == m2; 6521 } 6522 6523 /* 6524 * Add a suggestion to the list of suggestions. 6525 * For a suggestion that is already in the list the lowest score is remembered. 6526 */ 6527 static void 6528 add_suggestion( 6529 suginfo_T *su, 6530 garray_T *gap, /* either su_ga or su_sga */ 6531 char_u *goodword, 6532 int badlenarg, /* len of bad word replaced with "goodword" */ 6533 int score, 6534 int altscore, 6535 int had_bonus, /* value for st_had_bonus */ 6536 slang_T *slang, /* language for sound folding */ 6537 int maxsf) /* su_maxscore applies to soundfold score, 6538 su_sfmaxscore to the total score. 
*/ 6539 { 6540 int goodlen; /* len of goodword changed */ 6541 int badlen; /* len of bad word changed */ 6542 suggest_T *stp; 6543 suggest_T new_sug; 6544 int i; 6545 char_u *pgood, *pbad; 6546 6547 /* Minimize "badlen" for consistency. Avoids that changing "the the" to 6548 * "thee the" is added next to changing the first "the" the "thee". */ 6549 pgood = goodword + STRLEN(goodword); 6550 pbad = su->su_badptr + badlenarg; 6551 for (;;) 6552 { 6553 goodlen = (int)(pgood - goodword); 6554 badlen = (int)(pbad - su->su_badptr); 6555 if (goodlen <= 0 || badlen <= 0) 6556 break; 6557 MB_PTR_BACK(goodword, pgood); 6558 MB_PTR_BACK(su->su_badptr, pbad); 6559 if (has_mbyte) 6560 { 6561 if (mb_ptr2char(pgood) != mb_ptr2char(pbad)) 6562 break; 6563 } 6564 else if (*pgood != *pbad) 6565 break; 6566 } 6567 6568 if (badlen == 0 && goodlen == 0) 6569 /* goodword doesn't change anything; may happen for "the the" changing 6570 * the first "the" to itself. */ 6571 return; 6572 6573 if (gap->ga_len == 0) 6574 i = -1; 6575 else 6576 { 6577 /* Check if the word is already there. Also check the length that is 6578 * being replaced "thes," -> "these" is a different suggestion from 6579 * "thes" -> "these". */ 6580 stp = &SUG(*gap, 0); 6581 for (i = gap->ga_len; --i >= 0; ++stp) 6582 if (stp->st_wordlen == goodlen 6583 && stp->st_orglen == badlen 6584 && STRNCMP(stp->st_word, goodword, goodlen) == 0) 6585 { 6586 /* 6587 * Found it. Remember the word with the lowest score. 6588 */ 6589 if (stp->st_slang == NULL) 6590 stp->st_slang = slang; 6591 6592 new_sug.st_score = score; 6593 new_sug.st_altscore = altscore; 6594 new_sug.st_had_bonus = had_bonus; 6595 6596 if (stp->st_had_bonus != had_bonus) 6597 { 6598 /* Only one of the two had the soundalike score computed. 6599 * Need to do that for the other one now, otherwise the 6600 * scores can't be compared. 
This happens because 6601 * suggest_try_change() doesn't compute the soundalike 6602 * word to keep it fast, while some special methods set 6603 * the soundalike score to zero. */ 6604 if (had_bonus) 6605 rescore_one(su, stp); 6606 else 6607 { 6608 new_sug.st_word = stp->st_word; 6609 new_sug.st_wordlen = stp->st_wordlen; 6610 new_sug.st_slang = stp->st_slang; 6611 new_sug.st_orglen = badlen; 6612 rescore_one(su, &new_sug); 6613 } 6614 } 6615 6616 if (stp->st_score > new_sug.st_score) 6617 { 6618 stp->st_score = new_sug.st_score; 6619 stp->st_altscore = new_sug.st_altscore; 6620 stp->st_had_bonus = new_sug.st_had_bonus; 6621 } 6622 break; 6623 } 6624 } 6625 6626 if (i < 0 && ga_grow(gap, 1) == OK) 6627 { 6628 /* Add a suggestion. */ 6629 stp = &SUG(*gap, gap->ga_len); 6630 stp->st_word = vim_strnsave(goodword, goodlen); 6631 if (stp->st_word != NULL) 6632 { 6633 stp->st_wordlen = goodlen; 6634 stp->st_score = score; 6635 stp->st_altscore = altscore; 6636 stp->st_had_bonus = had_bonus; 6637 stp->st_orglen = badlen; 6638 stp->st_slang = slang; 6639 ++gap->ga_len; 6640 6641 /* If we have too many suggestions now, sort the list and keep 6642 * the best suggestions. */ 6643 if (gap->ga_len > SUG_MAX_COUNT(su)) 6644 { 6645 if (maxsf) 6646 su->su_sfmaxscore = cleanup_suggestions(gap, 6647 su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); 6648 else 6649 su->su_maxscore = cleanup_suggestions(gap, 6650 su->su_maxscore, SUG_CLEAN_COUNT(su)); 6651 } 6652 } 6653 } 6654 } 6655 6656 /* 6657 * Suggestions may in fact be flagged as errors. Esp. for banned words and 6658 * for split words, such as "the the". Remove these from the list here. 
6659 */ 6660 static void 6661 check_suggestions( 6662 suginfo_T *su, 6663 garray_T *gap) /* either su_ga or su_sga */ 6664 { 6665 suggest_T *stp; 6666 int i; 6667 char_u longword[MAXWLEN + 1]; 6668 int len; 6669 hlf_T attr; 6670 6671 stp = &SUG(*gap, 0); 6672 for (i = gap->ga_len - 1; i >= 0; --i) 6673 { 6674 /* Need to append what follows to check for "the the". */ 6675 vim_strncpy(longword, stp[i].st_word, MAXWLEN); 6676 len = stp[i].st_wordlen; 6677 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen, 6678 MAXWLEN - len); 6679 attr = HLF_COUNT; 6680 (void)spell_check(curwin, longword, &attr, NULL, FALSE); 6681 if (attr != HLF_COUNT) 6682 { 6683 /* Remove this entry. */ 6684 vim_free(stp[i].st_word); 6685 --gap->ga_len; 6686 if (i < gap->ga_len) 6687 mch_memmove(stp + i, stp + i + 1, 6688 sizeof(suggest_T) * (gap->ga_len - i)); 6689 } 6690 } 6691 } 6692 6693 6694 /* 6695 * Add a word to be banned. 6696 */ 6697 static void 6698 add_banned( 6699 suginfo_T *su, 6700 char_u *word) 6701 { 6702 char_u *s; 6703 hash_T hash; 6704 hashitem_T *hi; 6705 6706 hash = hash_hash(word); 6707 hi = hash_lookup(&su->su_banned, word, hash); 6708 if (HASHITEM_EMPTY(hi)) 6709 { 6710 s = vim_strsave(word); 6711 if (s != NULL) 6712 hash_add_item(&su->su_banned, hi, s, hash); 6713 } 6714 } 6715 6716 /* 6717 * Recompute the score for all suggestions if sound-folding is possible. This 6718 * is slow, thus only done for the final results. 6719 */ 6720 static void 6721 rescore_suggestions(suginfo_T *su) 6722 { 6723 int i; 6724 6725 if (su->su_sallang != NULL) 6726 for (i = 0; i < su->su_ga.ga_len; ++i) 6727 rescore_one(su, &SUG(su->su_ga, i)); 6728 } 6729 6730 /* 6731 * Recompute the score for one suggestion if sound-folding is possible. 
6732 */ 6733 static void 6734 rescore_one(suginfo_T *su, suggest_T *stp) 6735 { 6736 slang_T *slang = stp->st_slang; 6737 char_u sal_badword[MAXWLEN]; 6738 char_u *p; 6739 6740 /* Only rescore suggestions that have no sal score yet and do have a 6741 * language. */ 6742 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus) 6743 { 6744 if (slang == su->su_sallang) 6745 p = su->su_sal_badword; 6746 else 6747 { 6748 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword); 6749 p = sal_badword; 6750 } 6751 6752 stp->st_altscore = stp_sal_score(stp, su, slang, p); 6753 if (stp->st_altscore == SCORE_MAXMAX) 6754 stp->st_altscore = SCORE_BIG; 6755 stp->st_score = RESCORE(stp->st_score, stp->st_altscore); 6756 stp->st_had_bonus = TRUE; 6757 } 6758 } 6759 6760 static int 6761 #ifdef __BORLANDC__ 6762 _RTLENTRYF 6763 #endif 6764 sug_compare(const void *s1, const void *s2); 6765 6766 /* 6767 * Function given to qsort() to sort the suggestions on st_score. 6768 * First on "st_score", then "st_altscore" then alphabetically. 6769 */ 6770 static int 6771 #ifdef __BORLANDC__ 6772 _RTLENTRYF 6773 #endif 6774 sug_compare(const void *s1, const void *s2) 6775 { 6776 suggest_T *p1 = (suggest_T *)s1; 6777 suggest_T *p2 = (suggest_T *)s2; 6778 int n = p1->st_score - p2->st_score; 6779 6780 if (n == 0) 6781 { 6782 n = p1->st_altscore - p2->st_altscore; 6783 if (n == 0) 6784 n = STRICMP(p1->st_word, p2->st_word); 6785 } 6786 return n; 6787 } 6788 6789 /* 6790 * Cleanup the suggestions: 6791 * - Sort on score. 6792 * - Remove words that won't be displayed. 6793 * Returns the maximum score in the list or "maxscore" unmodified. 6794 */ 6795 static int 6796 cleanup_suggestions( 6797 garray_T *gap, 6798 int maxscore, 6799 int keep) /* nr of suggestions to keep */ 6800 { 6801 suggest_T *stp = &SUG(*gap, 0); 6802 int i; 6803 6804 /* Sort the list. 
*/ 6805 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); 6806 6807 /* Truncate the list to the number of suggestions that will be displayed. */ 6808 if (gap->ga_len > keep) 6809 { 6810 for (i = keep; i < gap->ga_len; ++i) 6811 vim_free(stp[i].st_word); 6812 gap->ga_len = keep; 6813 return stp[keep - 1].st_score; 6814 } 6815 return maxscore; 6816 } 6817 6818 #if defined(FEAT_EVAL) || defined(PROTO) 6819 /* 6820 * Soundfold a string, for soundfold(). 6821 * Result is in allocated memory, NULL for an error. 6822 */ 6823 char_u * 6824 eval_soundfold(char_u *word) 6825 { 6826 langp_T *lp; 6827 char_u sound[MAXWLEN]; 6828 int lpi; 6829 6830 if (curwin->w_p_spell && *curwin->w_s->b_p_spl != NUL) 6831 /* Use the sound-folding of the first language that supports it. */ 6832 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 6833 { 6834 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 6835 if (lp->lp_slang->sl_sal.ga_len > 0) 6836 { 6837 /* soundfold the word */ 6838 spell_soundfold(lp->lp_slang, word, FALSE, sound); 6839 return vim_strsave(sound); 6840 } 6841 } 6842 6843 /* No language with sound folding, return word as-is. */ 6844 return vim_strsave(word); 6845 } 6846 #endif 6847 6848 /* 6849 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 6850 * 6851 * There are many ways to turn a word into a sound-a-like representation. The 6852 * oldest is Soundex (1918!). A nice overview can be found in "Approximate 6853 * swedish name matching - survey and test of different algorithms" by Klas 6854 * Erikson. 6855 * 6856 * We support two methods: 6857 * 1. SOFOFROM/SOFOTO do a simple character mapping. 6858 * 2. SAL items define a more advanced sound-folding (and much slower). 
6859 */ 6860 void 6861 spell_soundfold( 6862 slang_T *slang, 6863 char_u *inword, 6864 int folded, /* "inword" is already case-folded */ 6865 char_u *res) 6866 { 6867 char_u fword[MAXWLEN]; 6868 char_u *word; 6869 6870 if (slang->sl_sofo) 6871 /* SOFOFROM and SOFOTO used */ 6872 spell_soundfold_sofo(slang, inword, res); 6873 else 6874 { 6875 /* SAL items used. Requires the word to be case-folded. */ 6876 if (folded) 6877 word = inword; 6878 else 6879 { 6880 (void)spell_casefold(inword, (int)STRLEN(inword), fword, MAXWLEN); 6881 word = fword; 6882 } 6883 6884 if (has_mbyte) 6885 spell_soundfold_wsal(slang, word, res); 6886 else 6887 spell_soundfold_sal(slang, word, res); 6888 } 6889 } 6890 6891 /* 6892 * Perform sound folding of "inword" into "res" according to SOFOFROM and 6893 * SOFOTO lines. 6894 */ 6895 static void 6896 spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res) 6897 { 6898 char_u *s; 6899 int ri = 0; 6900 int c; 6901 6902 if (has_mbyte) 6903 { 6904 int prevc = 0; 6905 int *ip; 6906 6907 /* The sl_sal_first[] table contains the translation for chars up to 6908 * 255, sl_sal the rest. */ 6909 for (s = inword; *s != NUL; ) 6910 { 6911 c = mb_cptr2char_adv(&s); 6912 if (enc_utf8 ? utf_class(c) == 0 : VIM_ISWHITE(c)) 6913 c = ' '; 6914 else if (c < 256) 6915 c = slang->sl_sal_first[c]; 6916 else 6917 { 6918 ip = ((int **)slang->sl_sal.ga_data)[c & 0xff]; 6919 if (ip == NULL) /* empty list, can't match */ 6920 c = NUL; 6921 else 6922 for (;;) /* find "c" in the list */ 6923 { 6924 if (*ip == 0) /* not found */ 6925 { 6926 c = NUL; 6927 break; 6928 } 6929 if (*ip == c) /* match! */ 6930 { 6931 c = ip[1]; 6932 break; 6933 } 6934 ip += 2; 6935 } 6936 } 6937 6938 if (c != NUL && c != prevc) 6939 { 6940 ri += mb_char2bytes(c, res + ri); 6941 if (ri + MB_MAXBYTES > MAXWLEN) 6942 break; 6943 prevc = c; 6944 } 6945 } 6946 } 6947 else 6948 { 6949 /* The sl_sal_first[] table contains the translation. 
*/ 6950 for (s = inword; (c = *s) != NUL; ++s) 6951 { 6952 if (VIM_ISWHITE(c)) 6953 c = ' '; 6954 else 6955 c = slang->sl_sal_first[c]; 6956 if (c != NUL && (ri == 0 || res[ri - 1] != c)) 6957 res[ri++] = c; 6958 } 6959 } 6960 6961 res[ri] = NUL; 6962 } 6963 6964 static void 6965 spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res) 6966 { 6967 salitem_T *smp; 6968 char_u word[MAXWLEN]; 6969 char_u *s = inword; 6970 char_u *t; 6971 char_u *pf; 6972 int i, j, z; 6973 int reslen; 6974 int n, k = 0; 6975 int z0; 6976 int k0; 6977 int n0; 6978 int c; 6979 int pri; 6980 int p0 = -333; 6981 int c0; 6982 6983 /* Remove accents, if wanted. We actually remove all non-word characters. 6984 * But keep white space. We need a copy, the word may be changed here. */ 6985 if (slang->sl_rem_accents) 6986 { 6987 t = word; 6988 while (*s != NUL) 6989 { 6990 if (VIM_ISWHITE(*s)) 6991 { 6992 *t++ = ' '; 6993 s = skipwhite(s); 6994 } 6995 else 6996 { 6997 if (spell_iswordp_nmw(s, curwin)) 6998 *t++ = *s; 6999 ++s; 7000 } 7001 } 7002 *t = NUL; 7003 } 7004 else 7005 vim_strncpy(word, s, MAXWLEN - 1); 7006 7007 smp = (salitem_T *)slang->sl_sal.ga_data; 7008 7009 /* 7010 * This comes from Aspell phonet.cpp. Converted from C++ to C. 7011 * Changed to keep spaces. 7012 */ 7013 i = reslen = z = 0; 7014 while ((c = word[i]) != NUL) 7015 { 7016 /* Start with the first rule that has the character in the word. */ 7017 n = slang->sl_sal_first[c]; 7018 z0 = 0; 7019 7020 if (n >= 0) 7021 { 7022 /* check all rules for the same letter */ 7023 for (; (s = smp[n].sm_lead)[0] == c; ++n) 7024 { 7025 /* Quickly skip entries that don't match the word. Most 7026 * entries are less then three chars, optimize for that. 
*/ 7027 k = smp[n].sm_leadlen; 7028 if (k > 1) 7029 { 7030 if (word[i + 1] != s[1]) 7031 continue; 7032 if (k > 2) 7033 { 7034 for (j = 2; j < k; ++j) 7035 if (word[i + j] != s[j]) 7036 break; 7037 if (j < k) 7038 continue; 7039 } 7040 } 7041 7042 if ((pf = smp[n].sm_oneof) != NULL) 7043 { 7044 /* Check for match with one of the chars in "sm_oneof". */ 7045 while (*pf != NUL && *pf != word[i + k]) 7046 ++pf; 7047 if (*pf == NUL) 7048 continue; 7049 ++k; 7050 } 7051 s = smp[n].sm_rules; 7052 pri = 5; /* default priority */ 7053 7054 p0 = *s; 7055 k0 = k; 7056 while (*s == '-' && k > 1) 7057 { 7058 k--; 7059 s++; 7060 } 7061 if (*s == '<') 7062 s++; 7063 if (VIM_ISDIGIT(*s)) 7064 { 7065 /* determine priority */ 7066 pri = *s - '0'; 7067 s++; 7068 } 7069 if (*s == '^' && *(s + 1) == '^') 7070 s++; 7071 7072 if (*s == NUL 7073 || (*s == '^' 7074 && (i == 0 || !(word[i - 1] == ' ' 7075 || spell_iswordp(word + i - 1, curwin))) 7076 && (*(s + 1) != '$' 7077 || (!spell_iswordp(word + i + k0, curwin)))) 7078 || (*s == '$' && i > 0 7079 && spell_iswordp(word + i - 1, curwin) 7080 && (!spell_iswordp(word + i + k0, curwin)))) 7081 { 7082 /* search for followup rules, if: */ 7083 /* followup and k > 1 and NO '-' in searchstring */ 7084 c0 = word[i + k - 1]; 7085 n0 = slang->sl_sal_first[c0]; 7086 7087 if (slang->sl_followup && k > 1 && n0 >= 0 7088 && p0 != '-' && word[i + k] != NUL) 7089 { 7090 /* test follow-up rule for "word[i + k]" */ 7091 for ( ; (s = smp[n0].sm_lead)[0] == c0; ++n0) 7092 { 7093 /* Quickly skip entries that don't match the word. 7094 * */ 7095 k0 = smp[n0].sm_leadlen; 7096 if (k0 > 1) 7097 { 7098 if (word[i + k] != s[1]) 7099 continue; 7100 if (k0 > 2) 7101 { 7102 pf = word + i + k + 1; 7103 for (j = 2; j < k0; ++j) 7104 if (*pf++ != s[j]) 7105 break; 7106 if (j < k0) 7107 continue; 7108 } 7109 } 7110 k0 += k - 1; 7111 7112 if ((pf = smp[n0].sm_oneof) != NULL) 7113 { 7114 /* Check for match with one of the chars in 7115 * "sm_oneof". 
*/ 7116 while (*pf != NUL && *pf != word[i + k0]) 7117 ++pf; 7118 if (*pf == NUL) 7119 continue; 7120 ++k0; 7121 } 7122 7123 p0 = 5; 7124 s = smp[n0].sm_rules; 7125 while (*s == '-') 7126 { 7127 /* "k0" gets NOT reduced because 7128 * "if (k0 == k)" */ 7129 s++; 7130 } 7131 if (*s == '<') 7132 s++; 7133 if (VIM_ISDIGIT(*s)) 7134 { 7135 p0 = *s - '0'; 7136 s++; 7137 } 7138 7139 if (*s == NUL 7140 /* *s == '^' cuts */ 7141 || (*s == '$' 7142 && !spell_iswordp(word + i + k0, 7143 curwin))) 7144 { 7145 if (k0 == k) 7146 /* this is just a piece of the string */ 7147 continue; 7148 7149 if (p0 < pri) 7150 /* priority too low */ 7151 continue; 7152 /* rule fits; stop search */ 7153 break; 7154 } 7155 } 7156 7157 if (p0 >= pri && smp[n0].sm_lead[0] == c0) 7158 continue; 7159 } 7160 7161 /* replace string */ 7162 s = smp[n].sm_to; 7163 if (s == NULL) 7164 s = (char_u *)""; 7165 pf = smp[n].sm_rules; 7166 p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0; 7167 if (p0 == 1 && z == 0) 7168 { 7169 /* rule with '<' is used */ 7170 if (reslen > 0 && *s != NUL && (res[reslen - 1] == c 7171 || res[reslen - 1] == *s)) 7172 reslen--; 7173 z0 = 1; 7174 z = 1; 7175 k0 = 0; 7176 while (*s != NUL && word[i + k0] != NUL) 7177 { 7178 word[i + k0] = *s; 7179 k0++; 7180 s++; 7181 } 7182 if (k > k0) 7183 STRMOVE(word + i + k0, word + i + k); 7184 7185 /* new "actual letter" */ 7186 c = word[i]; 7187 } 7188 else 7189 { 7190 /* no '<' rule used */ 7191 i += k - 1; 7192 z = 0; 7193 while (*s != NUL && s[1] != NUL && reslen < MAXWLEN) 7194 { 7195 if (reslen == 0 || res[reslen - 1] != *s) 7196 res[reslen++] = *s; 7197 s++; 7198 } 7199 /* new "actual letter" */ 7200 c = *s; 7201 if (strstr((char *)pf, "^^") != NULL) 7202 { 7203 if (c != NUL) 7204 res[reslen++] = c; 7205 STRMOVE(word, word + i + 1); 7206 i = 0; 7207 z0 = 1; 7208 } 7209 } 7210 break; 7211 } 7212 } 7213 } 7214 else if (VIM_ISWHITE(c)) 7215 { 7216 c = ' '; 7217 k = 1; 7218 } 7219 7220 if (z0 == 0) 7221 { 7222 if (k && !p0 && reslen < MAXWLEN 
&& c != NUL 7223 && (!slang->sl_collapse || reslen == 0 7224 || res[reslen - 1] != c)) 7225 /* condense only double letters */ 7226 res[reslen++] = c; 7227 7228 i++; 7229 z = 0; 7230 k = 0; 7231 } 7232 } 7233 7234 res[reslen] = NUL; 7235 } 7236 7237 /* 7238 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 7239 * Multi-byte version of spell_soundfold(). 7240 */ 7241 static void 7242 spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res) 7243 { 7244 salitem_T *smp = (salitem_T *)slang->sl_sal.ga_data; 7245 int word[MAXWLEN]; 7246 int wres[MAXWLEN]; 7247 int l; 7248 char_u *s; 7249 int *ws; 7250 char_u *t; 7251 int *pf; 7252 int i, j, z; 7253 int reslen; 7254 int n, k = 0; 7255 int z0; 7256 int k0; 7257 int n0; 7258 int c; 7259 int pri; 7260 int p0 = -333; 7261 int c0; 7262 int did_white = FALSE; 7263 int wordlen; 7264 7265 7266 /* 7267 * Convert the multi-byte string to a wide-character string. 7268 * Remove accents, if wanted. We actually remove all non-word characters. 7269 * But keep white space. 7270 */ 7271 wordlen = 0; 7272 for (s = inword; *s != NUL; ) 7273 { 7274 t = s; 7275 c = mb_cptr2char_adv(&s); 7276 if (slang->sl_rem_accents) 7277 { 7278 if (enc_utf8 ? utf_class(c) == 0 : VIM_ISWHITE(c)) 7279 { 7280 if (did_white) 7281 continue; 7282 c = ' '; 7283 did_white = TRUE; 7284 } 7285 else 7286 { 7287 did_white = FALSE; 7288 if (!spell_iswordp_nmw(t, curwin)) 7289 continue; 7290 } 7291 } 7292 word[wordlen++] = c; 7293 } 7294 word[wordlen] = NUL; 7295 7296 /* 7297 * This algorithm comes from Aspell phonet.cpp. 7298 * Converted from C++ to C. Added support for multi-byte chars. 7299 * Changed to keep spaces. 7300 */ 7301 i = reslen = z = 0; 7302 while ((c = word[i]) != NUL) 7303 { 7304 /* Start with the first rule that has the character in the word. */ 7305 n = slang->sl_sal_first[c & 0xff]; 7306 z0 = 0; 7307 7308 if (n >= 0) 7309 { 7310 /* Check all rules for the same index byte. 
7311 * If c is 0x300 need extra check for the end of the array, as 7312 * (c & 0xff) is NUL. */ 7313 for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff) 7314 && ws[0] != NUL; ++n) 7315 { 7316 /* Quickly skip entries that don't match the word. Most 7317 * entries are less then three chars, optimize for that. */ 7318 if (c != ws[0]) 7319 continue; 7320 k = smp[n].sm_leadlen; 7321 if (k > 1) 7322 { 7323 if (word[i + 1] != ws[1]) 7324 continue; 7325 if (k > 2) 7326 { 7327 for (j = 2; j < k; ++j) 7328 if (word[i + j] != ws[j]) 7329 break; 7330 if (j < k) 7331 continue; 7332 } 7333 } 7334 7335 if ((pf = smp[n].sm_oneof_w) != NULL) 7336 { 7337 /* Check for match with one of the chars in "sm_oneof". */ 7338 while (*pf != NUL && *pf != word[i + k]) 7339 ++pf; 7340 if (*pf == NUL) 7341 continue; 7342 ++k; 7343 } 7344 s = smp[n].sm_rules; 7345 pri = 5; /* default priority */ 7346 7347 p0 = *s; 7348 k0 = k; 7349 while (*s == '-' && k > 1) 7350 { 7351 k--; 7352 s++; 7353 } 7354 if (*s == '<') 7355 s++; 7356 if (VIM_ISDIGIT(*s)) 7357 { 7358 /* determine priority */ 7359 pri = *s - '0'; 7360 s++; 7361 } 7362 if (*s == '^' && *(s + 1) == '^') 7363 s++; 7364 7365 if (*s == NUL 7366 || (*s == '^' 7367 && (i == 0 || !(word[i - 1] == ' ' 7368 || spell_iswordp_w(word + i - 1, curwin))) 7369 && (*(s + 1) != '$' 7370 || (!spell_iswordp_w(word + i + k0, curwin)))) 7371 || (*s == '$' && i > 0 7372 && spell_iswordp_w(word + i - 1, curwin) 7373 && (!spell_iswordp_w(word + i + k0, curwin)))) 7374 { 7375 /* search for followup rules, if: */ 7376 /* followup and k > 1 and NO '-' in searchstring */ 7377 c0 = word[i + k - 1]; 7378 n0 = slang->sl_sal_first[c0 & 0xff]; 7379 7380 if (slang->sl_followup && k > 1 && n0 >= 0 7381 && p0 != '-' && word[i + k] != NUL) 7382 { 7383 /* Test follow-up rule for "word[i + k]"; loop over 7384 * all entries with the same index byte. 
*/ 7385 for ( ; ((ws = smp[n0].sm_lead_w)[0] & 0xff) 7386 == (c0 & 0xff); ++n0) 7387 { 7388 /* Quickly skip entries that don't match the word. 7389 */ 7390 if (c0 != ws[0]) 7391 continue; 7392 k0 = smp[n0].sm_leadlen; 7393 if (k0 > 1) 7394 { 7395 if (word[i + k] != ws[1]) 7396 continue; 7397 if (k0 > 2) 7398 { 7399 pf = word + i + k + 1; 7400 for (j = 2; j < k0; ++j) 7401 if (*pf++ != ws[j]) 7402 break; 7403 if (j < k0) 7404 continue; 7405 } 7406 } 7407 k0 += k - 1; 7408 7409 if ((pf = smp[n0].sm_oneof_w) != NULL) 7410 { 7411 /* Check for match with one of the chars in 7412 * "sm_oneof". */ 7413 while (*pf != NUL && *pf != word[i + k0]) 7414 ++pf; 7415 if (*pf == NUL) 7416 continue; 7417 ++k0; 7418 } 7419 7420 p0 = 5; 7421 s = smp[n0].sm_rules; 7422 while (*s == '-') 7423 { 7424 /* "k0" gets NOT reduced because 7425 * "if (k0 == k)" */ 7426 s++; 7427 } 7428 if (*s == '<') 7429 s++; 7430 if (VIM_ISDIGIT(*s)) 7431 { 7432 p0 = *s - '0'; 7433 s++; 7434 } 7435 7436 if (*s == NUL 7437 /* *s == '^' cuts */ 7438 || (*s == '$' 7439 && !spell_iswordp_w(word + i + k0, 7440 curwin))) 7441 { 7442 if (k0 == k) 7443 /* this is just a piece of the string */ 7444 continue; 7445 7446 if (p0 < pri) 7447 /* priority too low */ 7448 continue; 7449 /* rule fits; stop search */ 7450 break; 7451 } 7452 } 7453 7454 if (p0 >= pri && (smp[n0].sm_lead_w[0] & 0xff) 7455 == (c0 & 0xff)) 7456 continue; 7457 } 7458 7459 /* replace string */ 7460 ws = smp[n].sm_to_w; 7461 s = smp[n].sm_rules; 7462 p0 = (vim_strchr(s, '<') != NULL) ? 
1 : 0; 7463 if (p0 == 1 && z == 0) 7464 { 7465 /* rule with '<' is used */ 7466 if (reslen > 0 && ws != NULL && *ws != NUL 7467 && (wres[reslen - 1] == c 7468 || wres[reslen - 1] == *ws)) 7469 reslen--; 7470 z0 = 1; 7471 z = 1; 7472 k0 = 0; 7473 if (ws != NULL) 7474 while (*ws != NUL && word[i + k0] != NUL) 7475 { 7476 word[i + k0] = *ws; 7477 k0++; 7478 ws++; 7479 } 7480 if (k > k0) 7481 mch_memmove(word + i + k0, word + i + k, 7482 sizeof(int) * (wordlen - (i + k) + 1)); 7483 7484 /* new "actual letter" */ 7485 c = word[i]; 7486 } 7487 else 7488 { 7489 /* no '<' rule used */ 7490 i += k - 1; 7491 z = 0; 7492 if (ws != NULL) 7493 while (*ws != NUL && ws[1] != NUL 7494 && reslen < MAXWLEN) 7495 { 7496 if (reslen == 0 || wres[reslen - 1] != *ws) 7497 wres[reslen++] = *ws; 7498 ws++; 7499 } 7500 /* new "actual letter" */ 7501 if (ws == NULL) 7502 c = NUL; 7503 else 7504 c = *ws; 7505 if (strstr((char *)s, "^^") != NULL) 7506 { 7507 if (c != NUL) 7508 wres[reslen++] = c; 7509 mch_memmove(word, word + i + 1, 7510 sizeof(int) * (wordlen - (i + 1) + 1)); 7511 i = 0; 7512 z0 = 1; 7513 } 7514 } 7515 break; 7516 } 7517 } 7518 } 7519 else if (VIM_ISWHITE(c)) 7520 { 7521 c = ' '; 7522 k = 1; 7523 } 7524 7525 if (z0 == 0) 7526 { 7527 if (k && !p0 && reslen < MAXWLEN && c != NUL 7528 && (!slang->sl_collapse || reslen == 0 7529 || wres[reslen - 1] != c)) 7530 /* condense only double letters */ 7531 wres[reslen++] = c; 7532 7533 i++; 7534 z = 0; 7535 k = 0; 7536 } 7537 } 7538 7539 /* Convert wide characters in "wres" to a multi-byte string in "res". */ 7540 l = 0; 7541 for (n = 0; n < reslen; ++n) 7542 { 7543 l += mb_char2bytes(wres[n], res + l); 7544 if (l + MB_MAXBYTES > MAXWLEN) 7545 break; 7546 } 7547 res[l] = NUL; 7548 } 7549 7550 /* 7551 * Compute a score for two sound-a-like words. 7552 * This permits up to two inserts/deletes/swaps/etc. to keep things fast. 7553 * Instead of a generic loop we write out the code. 
That keeps it fast by 7554 * avoiding checks that will not be possible. 7555 */ 7556 static int 7557 soundalike_score( 7558 char_u *goodstart, /* sound-folded good word */ 7559 char_u *badstart) /* sound-folded bad word */ 7560 { 7561 char_u *goodsound = goodstart; 7562 char_u *badsound = badstart; 7563 int goodlen; 7564 int badlen; 7565 int n; 7566 char_u *pl, *ps; 7567 char_u *pl2, *ps2; 7568 int score = 0; 7569 7570 /* Adding/inserting "*" at the start (word starts with vowel) shouldn't be 7571 * counted so much, vowels halfway the word aren't counted at all. */ 7572 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) 7573 { 7574 if ((badsound[0] == NUL && goodsound[1] == NUL) 7575 || (goodsound[0] == NUL && badsound[1] == NUL)) 7576 /* changing word with vowel to word without a sound */ 7577 return SCORE_DEL; 7578 if (badsound[0] == NUL || goodsound[0] == NUL) 7579 /* more than two changes */ 7580 return SCORE_MAXMAX; 7581 7582 if (badsound[1] == goodsound[1] 7583 || (badsound[1] != NUL 7584 && goodsound[1] != NUL 7585 && badsound[2] == goodsound[2])) 7586 { 7587 /* handle like a substitute */ 7588 } 7589 else 7590 { 7591 score = 2 * SCORE_DEL / 3; 7592 if (*badsound == '*') 7593 ++badsound; 7594 else 7595 ++goodsound; 7596 } 7597 } 7598 7599 goodlen = (int)STRLEN(goodsound); 7600 badlen = (int)STRLEN(badsound); 7601 7602 /* Return quickly if the lengths are too different to be fixed by two 7603 * changes. */ 7604 n = goodlen - badlen; 7605 if (n < -2 || n > 2) 7606 return SCORE_MAXMAX; 7607 7608 if (n > 0) 7609 { 7610 pl = goodsound; /* goodsound is longest */ 7611 ps = badsound; 7612 } 7613 else 7614 { 7615 pl = badsound; /* badsound is longest */ 7616 ps = goodsound; 7617 } 7618 7619 /* Skip over the identical part. */ 7620 while (*pl == *ps && *pl != NUL) 7621 { 7622 ++pl; 7623 ++ps; 7624 } 7625 7626 switch (n) 7627 { 7628 case -2: 7629 case 2: 7630 /* 7631 * Must delete two characters from "pl". 
7632 */ 7633 ++pl; /* first delete */ 7634 while (*pl == *ps) 7635 { 7636 ++pl; 7637 ++ps; 7638 } 7639 /* strings must be equal after second delete */ 7640 if (STRCMP(pl + 1, ps) == 0) 7641 return score + SCORE_DEL * 2; 7642 7643 /* Failed to compare. */ 7644 break; 7645 7646 case -1: 7647 case 1: 7648 /* 7649 * Minimal one delete from "pl" required. 7650 */ 7651 7652 /* 1: delete */ 7653 pl2 = pl + 1; 7654 ps2 = ps; 7655 while (*pl2 == *ps2) 7656 { 7657 if (*pl2 == NUL) /* reached the end */ 7658 return score + SCORE_DEL; 7659 ++pl2; 7660 ++ps2; 7661 } 7662 7663 /* 2: delete then swap, then rest must be equal */ 7664 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 7665 && STRCMP(pl2 + 2, ps2 + 2) == 0) 7666 return score + SCORE_DEL + SCORE_SWAP; 7667 7668 /* 3: delete then substitute, then the rest must be equal */ 7669 if (STRCMP(pl2 + 1, ps2 + 1) == 0) 7670 return score + SCORE_DEL + SCORE_SUBST; 7671 7672 /* 4: first swap then delete */ 7673 if (pl[0] == ps[1] && pl[1] == ps[0]) 7674 { 7675 pl2 = pl + 2; /* swap, skip two chars */ 7676 ps2 = ps + 2; 7677 while (*pl2 == *ps2) 7678 { 7679 ++pl2; 7680 ++ps2; 7681 } 7682 /* delete a char and then strings must be equal */ 7683 if (STRCMP(pl2 + 1, ps2) == 0) 7684 return score + SCORE_SWAP + SCORE_DEL; 7685 } 7686 7687 /* 5: first substitute then delete */ 7688 pl2 = pl + 1; /* substitute, skip one char */ 7689 ps2 = ps + 1; 7690 while (*pl2 == *ps2) 7691 { 7692 ++pl2; 7693 ++ps2; 7694 } 7695 /* delete a char and then strings must be equal */ 7696 if (STRCMP(pl2 + 1, ps2) == 0) 7697 return score + SCORE_SUBST + SCORE_DEL; 7698 7699 /* Failed to compare. */ 7700 break; 7701 7702 case 0: 7703 /* 7704 * Lengths are equal, thus changes must result in same length: An 7705 * insert is only possible in combination with a delete. 
7706 * 1: check if for identical strings 7707 */ 7708 if (*pl == NUL) 7709 return score; 7710 7711 /* 2: swap */ 7712 if (pl[0] == ps[1] && pl[1] == ps[0]) 7713 { 7714 pl2 = pl + 2; /* swap, skip two chars */ 7715 ps2 = ps + 2; 7716 while (*pl2 == *ps2) 7717 { 7718 if (*pl2 == NUL) /* reached the end */ 7719 return score + SCORE_SWAP; 7720 ++pl2; 7721 ++ps2; 7722 } 7723 /* 3: swap and swap again */ 7724 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 7725 && STRCMP(pl2 + 2, ps2 + 2) == 0) 7726 return score + SCORE_SWAP + SCORE_SWAP; 7727 7728 /* 4: swap and substitute */ 7729 if (STRCMP(pl2 + 1, ps2 + 1) == 0) 7730 return score + SCORE_SWAP + SCORE_SUBST; 7731 } 7732 7733 /* 5: substitute */ 7734 pl2 = pl + 1; 7735 ps2 = ps + 1; 7736 while (*pl2 == *ps2) 7737 { 7738 if (*pl2 == NUL) /* reached the end */ 7739 return score + SCORE_SUBST; 7740 ++pl2; 7741 ++ps2; 7742 } 7743 7744 /* 6: substitute and swap */ 7745 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 7746 && STRCMP(pl2 + 2, ps2 + 2) == 0) 7747 return score + SCORE_SUBST + SCORE_SWAP; 7748 7749 /* 7: substitute and substitute */ 7750 if (STRCMP(pl2 + 1, ps2 + 1) == 0) 7751 return score + SCORE_SUBST + SCORE_SUBST; 7752 7753 /* 8: insert then delete */ 7754 pl2 = pl; 7755 ps2 = ps + 1; 7756 while (*pl2 == *ps2) 7757 { 7758 ++pl2; 7759 ++ps2; 7760 } 7761 if (STRCMP(pl2 + 1, ps2) == 0) 7762 return score + SCORE_INS + SCORE_DEL; 7763 7764 /* 9: delete then insert */ 7765 pl2 = pl + 1; 7766 ps2 = ps; 7767 while (*pl2 == *ps2) 7768 { 7769 ++pl2; 7770 ++ps2; 7771 } 7772 if (STRCMP(pl2, ps2 + 1) == 0) 7773 return score + SCORE_INS + SCORE_DEL; 7774 7775 /* Failed to compare. */ 7776 break; 7777 } 7778 7779 return SCORE_MAXMAX; 7780 } 7781 7782 /* 7783 * Compute the "edit distance" to turn "badword" into "goodword". The less 7784 * deletes/inserts/substitutes/swaps are required the lower the score. 7785 * 7786 * The algorithm is described by Du and Chang, 1992. 
7787 * The implementation of the algorithm comes from Aspell editdist.cpp, 7788 * edit_distance(). It has been converted from C++ to C and modified to 7789 * support multi-byte characters. 7790 */ 7791 static int 7792 spell_edit_score( 7793 slang_T *slang, 7794 char_u *badword, 7795 char_u *goodword) 7796 { 7797 int *cnt; 7798 int badlen, goodlen; /* lengths including NUL */ 7799 int j, i; 7800 int t; 7801 int bc, gc; 7802 int pbc, pgc; 7803 char_u *p; 7804 int wbadword[MAXWLEN]; 7805 int wgoodword[MAXWLEN]; 7806 7807 if (has_mbyte) 7808 { 7809 /* Get the characters from the multi-byte strings and put them in an 7810 * int array for easy access. */ 7811 for (p = badword, badlen = 0; *p != NUL; ) 7812 wbadword[badlen++] = mb_cptr2char_adv(&p); 7813 wbadword[badlen++] = 0; 7814 for (p = goodword, goodlen = 0; *p != NUL; ) 7815 wgoodword[goodlen++] = mb_cptr2char_adv(&p); 7816 wgoodword[goodlen++] = 0; 7817 } 7818 else 7819 { 7820 badlen = (int)STRLEN(badword) + 1; 7821 goodlen = (int)STRLEN(goodword) + 1; 7822 } 7823 7824 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */ 7825 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] 7826 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)), 7827 TRUE); 7828 if (cnt == NULL) 7829 return 0; /* out of memory */ 7830 7831 CNT(0, 0) = 0; 7832 for (j = 1; j <= goodlen; ++j) 7833 CNT(0, j) = CNT(0, j - 1) + SCORE_INS; 7834 7835 for (i = 1; i <= badlen; ++i) 7836 { 7837 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; 7838 for (j = 1; j <= goodlen; ++j) 7839 { 7840 if (has_mbyte) 7841 { 7842 bc = wbadword[i - 1]; 7843 gc = wgoodword[j - 1]; 7844 } 7845 else 7846 { 7847 bc = badword[i - 1]; 7848 gc = goodword[j - 1]; 7849 } 7850 if (bc == gc) 7851 CNT(i, j) = CNT(i - 1, j - 1); 7852 else 7853 { 7854 /* Use a better score when there is only a case difference. 
*/ 7855 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 7856 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); 7857 else 7858 { 7859 /* For a similar character use SCORE_SIMILAR. */ 7860 if (slang != NULL 7861 && slang->sl_has_map 7862 && similar_chars(slang, gc, bc)) 7863 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); 7864 else 7865 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); 7866 } 7867 7868 if (i > 1 && j > 1) 7869 { 7870 if (has_mbyte) 7871 { 7872 pbc = wbadword[i - 2]; 7873 pgc = wgoodword[j - 2]; 7874 } 7875 else 7876 { 7877 pbc = badword[i - 2]; 7878 pgc = goodword[j - 2]; 7879 } 7880 if (bc == pgc && pbc == gc) 7881 { 7882 t = SCORE_SWAP + CNT(i - 2, j - 2); 7883 if (t < CNT(i, j)) 7884 CNT(i, j) = t; 7885 } 7886 } 7887 t = SCORE_DEL + CNT(i - 1, j); 7888 if (t < CNT(i, j)) 7889 CNT(i, j) = t; 7890 t = SCORE_INS + CNT(i, j - 1); 7891 if (t < CNT(i, j)) 7892 CNT(i, j) = t; 7893 } 7894 } 7895 } 7896 7897 i = CNT(badlen - 1, goodlen - 1); 7898 vim_free(cnt); 7899 return i; 7900 } 7901 7902 typedef struct 7903 { 7904 int badi; 7905 int goodi; 7906 int score; 7907 } limitscore_T; 7908 7909 /* 7910 * Like spell_edit_score(), but with a limit on the score to make it faster. 7911 * May return SCORE_MAXMAX when the score is higher than "limit". 7912 * 7913 * This uses a stack for the edits still to be tried. 7914 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support 7915 * for multi-byte characters. 7916 */ 7917 static int 7918 spell_edit_score_limit( 7919 slang_T *slang, 7920 char_u *badword, 7921 char_u *goodword, 7922 int limit) 7923 { 7924 limitscore_T stack[10]; /* allow for over 3 * 2 edits */ 7925 int stackidx; 7926 int bi, gi; 7927 int bi2, gi2; 7928 int bc, gc; 7929 int score; 7930 int score_off; 7931 int minscore; 7932 int round; 7933 7934 /* Multi-byte characters require a bit more work, use a different function 7935 * to avoid testing "has_mbyte" quite often. 
*/ 7936 if (has_mbyte) 7937 return spell_edit_score_limit_w(slang, badword, goodword, limit); 7938 7939 /* 7940 * The idea is to go from start to end over the words. So long as 7941 * characters are equal just continue, this always gives the lowest score. 7942 * When there is a difference try several alternatives. Each alternative 7943 * increases "score" for the edit distance. Some of the alternatives are 7944 * pushed unto a stack and tried later, some are tried right away. At the 7945 * end of the word the score for one alternative is known. The lowest 7946 * possible score is stored in "minscore". 7947 */ 7948 stackidx = 0; 7949 bi = 0; 7950 gi = 0; 7951 score = 0; 7952 minscore = limit + 1; 7953 7954 for (;;) 7955 { 7956 /* Skip over an equal part, score remains the same. */ 7957 for (;;) 7958 { 7959 bc = badword[bi]; 7960 gc = goodword[gi]; 7961 if (bc != gc) /* stop at a char that's different */ 7962 break; 7963 if (bc == NUL) /* both words end */ 7964 { 7965 if (score < minscore) 7966 minscore = score; 7967 goto pop; /* do next alternative */ 7968 } 7969 ++bi; 7970 ++gi; 7971 } 7972 7973 if (gc == NUL) /* goodword ends, delete badword chars */ 7974 { 7975 do 7976 { 7977 if ((score += SCORE_DEL) >= minscore) 7978 goto pop; /* do next alternative */ 7979 } while (badword[++bi] != NUL); 7980 minscore = score; 7981 } 7982 else if (bc == NUL) /* badword ends, insert badword chars */ 7983 { 7984 do 7985 { 7986 if ((score += SCORE_INS) >= minscore) 7987 goto pop; /* do next alternative */ 7988 } while (goodword[++gi] != NUL); 7989 minscore = score; 7990 } 7991 else /* both words continue */ 7992 { 7993 /* If not close to the limit, perform a change. Only try changes 7994 * that may lead to a lower score than "minscore". 7995 * round 0: try deleting a char from badword 7996 * round 1: try inserting a char in badword */ 7997 for (round = 0; round <= 1; ++round) 7998 { 7999 score_off = score + (round == 0 ? 
SCORE_DEL : SCORE_INS); 8000 if (score_off < minscore) 8001 { 8002 if (score_off + SCORE_EDIT_MIN >= minscore) 8003 { 8004 /* Near the limit, rest of the words must match. We 8005 * can check that right now, no need to push an item 8006 * onto the stack. */ 8007 bi2 = bi + 1 - round; 8008 gi2 = gi + round; 8009 while (goodword[gi2] == badword[bi2]) 8010 { 8011 if (goodword[gi2] == NUL) 8012 { 8013 minscore = score_off; 8014 break; 8015 } 8016 ++bi2; 8017 ++gi2; 8018 } 8019 } 8020 else 8021 { 8022 /* try deleting/inserting a character later */ 8023 stack[stackidx].badi = bi + 1 - round; 8024 stack[stackidx].goodi = gi + round; 8025 stack[stackidx].score = score_off; 8026 ++stackidx; 8027 } 8028 } 8029 } 8030 8031 if (score + SCORE_SWAP < minscore) 8032 { 8033 /* If swapping two characters makes a match then the 8034 * substitution is more expensive, thus there is no need to 8035 * try both. */ 8036 if (gc == badword[bi + 1] && bc == goodword[gi + 1]) 8037 { 8038 /* Swap two characters, that is: skip them. */ 8039 gi += 2; 8040 bi += 2; 8041 score += SCORE_SWAP; 8042 continue; 8043 } 8044 } 8045 8046 /* Substitute one character for another which is the same 8047 * thing as deleting a character from both goodword and badword. 8048 * Use a better score when there is only a case difference. */ 8049 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 8050 score += SCORE_ICASE; 8051 else 8052 { 8053 /* For a similar character use SCORE_SIMILAR. */ 8054 if (slang != NULL 8055 && slang->sl_has_map 8056 && similar_chars(slang, gc, bc)) 8057 score += SCORE_SIMILAR; 8058 else 8059 score += SCORE_SUBST; 8060 } 8061 8062 if (score < minscore) 8063 { 8064 /* Do the substitution. */ 8065 ++gi; 8066 ++bi; 8067 continue; 8068 } 8069 } 8070 pop: 8071 /* 8072 * Get here to try the next alternative, pop it from the stack. 
8073 */ 8074 if (stackidx == 0) /* stack is empty, finished */ 8075 break; 8076 8077 /* pop an item from the stack */ 8078 --stackidx; 8079 gi = stack[stackidx].goodi; 8080 bi = stack[stackidx].badi; 8081 score = stack[stackidx].score; 8082 } 8083 8084 /* When the score goes over "limit" it may actually be much higher. 8085 * Return a very large number to avoid going below the limit when giving a 8086 * bonus. */ 8087 if (minscore > limit) 8088 return SCORE_MAXMAX; 8089 return minscore; 8090 } 8091 8092 /* 8093 * Multi-byte version of spell_edit_score_limit(). 8094 * Keep it in sync with the above! 8095 */ 8096 static int 8097 spell_edit_score_limit_w( 8098 slang_T *slang, 8099 char_u *badword, 8100 char_u *goodword, 8101 int limit) 8102 { 8103 limitscore_T stack[10]; /* allow for over 3 * 2 edits */ 8104 int stackidx; 8105 int bi, gi; 8106 int bi2, gi2; 8107 int bc, gc; 8108 int score; 8109 int score_off; 8110 int minscore; 8111 int round; 8112 char_u *p; 8113 int wbadword[MAXWLEN]; 8114 int wgoodword[MAXWLEN]; 8115 8116 /* Get the characters from the multi-byte strings and put them in an 8117 * int array for easy access. */ 8118 bi = 0; 8119 for (p = badword; *p != NUL; ) 8120 wbadword[bi++] = mb_cptr2char_adv(&p); 8121 wbadword[bi++] = 0; 8122 gi = 0; 8123 for (p = goodword; *p != NUL; ) 8124 wgoodword[gi++] = mb_cptr2char_adv(&p); 8125 wgoodword[gi++] = 0; 8126 8127 /* 8128 * The idea is to go from start to end over the words. So long as 8129 * characters are equal just continue, this always gives the lowest score. 8130 * When there is a difference try several alternatives. Each alternative 8131 * increases "score" for the edit distance. Some of the alternatives are 8132 * pushed unto a stack and tried later, some are tried right away. At the 8133 * end of the word the score for one alternative is known. The lowest 8134 * possible score is stored in "minscore". 
8135 */ 8136 stackidx = 0; 8137 bi = 0; 8138 gi = 0; 8139 score = 0; 8140 minscore = limit + 1; 8141 8142 for (;;) 8143 { 8144 /* Skip over an equal part, score remains the same. */ 8145 for (;;) 8146 { 8147 bc = wbadword[bi]; 8148 gc = wgoodword[gi]; 8149 8150 if (bc != gc) /* stop at a char that's different */ 8151 break; 8152 if (bc == NUL) /* both words end */ 8153 { 8154 if (score < minscore) 8155 minscore = score; 8156 goto pop; /* do next alternative */ 8157 } 8158 ++bi; 8159 ++gi; 8160 } 8161 8162 if (gc == NUL) /* goodword ends, delete badword chars */ 8163 { 8164 do 8165 { 8166 if ((score += SCORE_DEL) >= minscore) 8167 goto pop; /* do next alternative */ 8168 } while (wbadword[++bi] != NUL); 8169 minscore = score; 8170 } 8171 else if (bc == NUL) /* badword ends, insert badword chars */ 8172 { 8173 do 8174 { 8175 if ((score += SCORE_INS) >= minscore) 8176 goto pop; /* do next alternative */ 8177 } while (wgoodword[++gi] != NUL); 8178 minscore = score; 8179 } 8180 else /* both words continue */ 8181 { 8182 /* If not close to the limit, perform a change. Only try changes 8183 * that may lead to a lower score than "minscore". 8184 * round 0: try deleting a char from badword 8185 * round 1: try inserting a char in badword */ 8186 for (round = 0; round <= 1; ++round) 8187 { 8188 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); 8189 if (score_off < minscore) 8190 { 8191 if (score_off + SCORE_EDIT_MIN >= minscore) 8192 { 8193 /* Near the limit, rest of the words must match. We 8194 * can check that right now, no need to push an item 8195 * onto the stack. 
*/ 8196 bi2 = bi + 1 - round; 8197 gi2 = gi + round; 8198 while (wgoodword[gi2] == wbadword[bi2]) 8199 { 8200 if (wgoodword[gi2] == NUL) 8201 { 8202 minscore = score_off; 8203 break; 8204 } 8205 ++bi2; 8206 ++gi2; 8207 } 8208 } 8209 else 8210 { 8211 /* try deleting a character from badword later */ 8212 stack[stackidx].badi = bi + 1 - round; 8213 stack[stackidx].goodi = gi + round; 8214 stack[stackidx].score = score_off; 8215 ++stackidx; 8216 } 8217 } 8218 } 8219 8220 if (score + SCORE_SWAP < minscore) 8221 { 8222 /* If swapping two characters makes a match then the 8223 * substitution is more expensive, thus there is no need to 8224 * try both. */ 8225 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1]) 8226 { 8227 /* Swap two characters, that is: skip them. */ 8228 gi += 2; 8229 bi += 2; 8230 score += SCORE_SWAP; 8231 continue; 8232 } 8233 } 8234 8235 /* Substitute one character for another which is the same 8236 * thing as deleting a character from both goodword and badword. 8237 * Use a better score when there is only a case difference. */ 8238 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 8239 score += SCORE_ICASE; 8240 else 8241 { 8242 /* For a similar character use SCORE_SIMILAR. */ 8243 if (slang != NULL 8244 && slang->sl_has_map 8245 && similar_chars(slang, gc, bc)) 8246 score += SCORE_SIMILAR; 8247 else 8248 score += SCORE_SUBST; 8249 } 8250 8251 if (score < minscore) 8252 { 8253 /* Do the substitution. */ 8254 ++gi; 8255 ++bi; 8256 continue; 8257 } 8258 } 8259 pop: 8260 /* 8261 * Get here to try the next alternative, pop it from the stack. 8262 */ 8263 if (stackidx == 0) /* stack is empty, finished */ 8264 break; 8265 8266 /* pop an item from the stack */ 8267 --stackidx; 8268 gi = stack[stackidx].goodi; 8269 bi = stack[stackidx].badi; 8270 score = stack[stackidx].score; 8271 } 8272 8273 /* When the score goes over "limit" it may actually be much higher. 8274 * Return a very large number to avoid going below the limit when giving a 8275 * bonus. 
*/ 8276 if (minscore > limit) 8277 return SCORE_MAXMAX; 8278 return minscore; 8279 } 8280 8281 /* 8282 * ":spellinfo" 8283 */ 8284 void 8285 ex_spellinfo(exarg_T *eap UNUSED) 8286 { 8287 int lpi; 8288 langp_T *lp; 8289 char_u *p; 8290 8291 if (no_spell_checking(curwin)) 8292 return; 8293 8294 msg_start(); 8295 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len && !got_int; ++lpi) 8296 { 8297 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8298 msg_puts("file: "); 8299 msg_puts((char *)lp->lp_slang->sl_fname); 8300 msg_putchar('\n'); 8301 p = lp->lp_slang->sl_info; 8302 if (p != NULL) 8303 { 8304 msg_puts((char *)p); 8305 msg_putchar('\n'); 8306 } 8307 } 8308 msg_end(); 8309 } 8310 8311 #define DUMPFLAG_KEEPCASE 1 /* round 2: keep-case tree */ 8312 #define DUMPFLAG_COUNT 2 /* include word count */ 8313 #define DUMPFLAG_ICASE 4 /* ignore case when finding matches */ 8314 #define DUMPFLAG_ONECAP 8 /* pattern starts with capital */ 8315 #define DUMPFLAG_ALLCAP 16 /* pattern is all capitals */ 8316 8317 /* 8318 * ":spelldump" 8319 */ 8320 void 8321 ex_spelldump(exarg_T *eap) 8322 { 8323 char_u *spl; 8324 long dummy; 8325 8326 if (no_spell_checking(curwin)) 8327 return; 8328 get_option_value((char_u*)"spl", &dummy, &spl, OPT_LOCAL); 8329 8330 /* Create a new empty buffer in a new window. */ 8331 do_cmdline_cmd((char_u *)"new"); 8332 8333 /* enable spelling locally in the new window */ 8334 set_option_value((char_u*)"spell", TRUE, (char_u*)"", OPT_LOCAL); 8335 set_option_value((char_u*)"spl", dummy, spl, OPT_LOCAL); 8336 vim_free(spl); 8337 8338 if (!BUFEMPTY()) 8339 return; 8340 8341 spell_dump_compl(NULL, 0, NULL, eap->forceit ? DUMPFLAG_COUNT : 0); 8342 8343 /* Delete the empty line that we started with. */ 8344 if (curbuf->b_ml.ml_line_count > 1) 8345 ml_delete(curbuf->b_ml.ml_line_count, FALSE); 8346 8347 redraw_later(NOT_VALID); 8348 } 8349 8350 /* 8351 * Go through all possible words and: 8352 * 1. When "pat" is NULL: dump a list of all words in the current buffer. 
8353 * "ic" and "dir" are not used. 8354 * 2. When "pat" is not NULL: add matching words to insert mode completion. 8355 */ 8356 void 8357 spell_dump_compl( 8358 char_u *pat, /* leading part of the word */ 8359 int ic, /* ignore case */ 8360 int *dir, /* direction for adding matches */ 8361 int dumpflags_arg) /* DUMPFLAG_* */ 8362 { 8363 langp_T *lp; 8364 slang_T *slang; 8365 idx_T arridx[MAXWLEN]; 8366 int curi[MAXWLEN]; 8367 char_u word[MAXWLEN]; 8368 int c; 8369 char_u *byts; 8370 idx_T *idxs; 8371 linenr_T lnum = 0; 8372 int round; 8373 int depth; 8374 int n; 8375 int flags; 8376 char_u *region_names = NULL; /* region names being used */ 8377 int do_region = TRUE; /* dump region names and numbers */ 8378 char_u *p; 8379 int lpi; 8380 int dumpflags = dumpflags_arg; 8381 int patlen; 8382 8383 /* When ignoring case or when the pattern starts with capital pass this on 8384 * to dump_word(). */ 8385 if (pat != NULL) 8386 { 8387 if (ic) 8388 dumpflags |= DUMPFLAG_ICASE; 8389 else 8390 { 8391 n = captype(pat, NULL); 8392 if (n == WF_ONECAP) 8393 dumpflags |= DUMPFLAG_ONECAP; 8394 else if (n == WF_ALLCAP && (int)STRLEN(pat) > mb_ptr2len(pat)) 8395 dumpflags |= DUMPFLAG_ALLCAP; 8396 } 8397 } 8398 8399 /* Find out if we can support regions: All languages must support the same 8400 * regions or none at all. 
*/ 8401 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 8402 { 8403 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8404 p = lp->lp_slang->sl_regions; 8405 if (p[0] != 0) 8406 { 8407 if (region_names == NULL) /* first language with regions */ 8408 region_names = p; 8409 else if (STRCMP(region_names, p) != 0) 8410 { 8411 do_region = FALSE; /* region names are different */ 8412 break; 8413 } 8414 } 8415 } 8416 8417 if (do_region && region_names != NULL) 8418 { 8419 if (pat == NULL) 8420 { 8421 vim_snprintf((char *)IObuff, IOSIZE, "/regions=%s", region_names); 8422 ml_append(lnum++, IObuff, (colnr_T)0, FALSE); 8423 } 8424 } 8425 else 8426 do_region = FALSE; 8427 8428 /* 8429 * Loop over all files loaded for the entries in 'spelllang'. 8430 */ 8431 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) 8432 { 8433 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 8434 slang = lp->lp_slang; 8435 if (slang->sl_fbyts == NULL) /* reloading failed */ 8436 continue; 8437 8438 if (pat == NULL) 8439 { 8440 vim_snprintf((char *)IObuff, IOSIZE, "# file: %s", slang->sl_fname); 8441 ml_append(lnum++, IObuff, (colnr_T)0, FALSE); 8442 } 8443 8444 /* When matching with a pattern and there are no prefixes only use 8445 * parts of the tree that match "pat". 
*/ 8446 if (pat != NULL && slang->sl_pbyts == NULL) 8447 patlen = (int)STRLEN(pat); 8448 else 8449 patlen = -1; 8450 8451 /* round 1: case-folded tree 8452 * round 2: keep-case tree */ 8453 for (round = 1; round <= 2; ++round) 8454 { 8455 if (round == 1) 8456 { 8457 dumpflags &= ~DUMPFLAG_KEEPCASE; 8458 byts = slang->sl_fbyts; 8459 idxs = slang->sl_fidxs; 8460 } 8461 else 8462 { 8463 dumpflags |= DUMPFLAG_KEEPCASE; 8464 byts = slang->sl_kbyts; 8465 idxs = slang->sl_kidxs; 8466 } 8467 if (byts == NULL) 8468 continue; /* array is empty */ 8469 8470 depth = 0; 8471 arridx[0] = 0; 8472 curi[0] = 1; 8473 while (depth >= 0 && !got_int 8474 && (pat == NULL || !ins_compl_interrupted())) 8475 { 8476 if (curi[depth] > byts[arridx[depth]]) 8477 { 8478 /* Done all bytes at this node, go up one level. */ 8479 --depth; 8480 line_breakcheck(); 8481 ins_compl_check_keys(50, FALSE); 8482 } 8483 else 8484 { 8485 /* Do one more byte at this node. */ 8486 n = arridx[depth] + curi[depth]; 8487 ++curi[depth]; 8488 c = byts[n]; 8489 if (c == 0) 8490 { 8491 /* End of word, deal with the word. 8492 * Don't use keep-case words in the fold-case tree, 8493 * they will appear in the keep-case tree. 8494 * Only use the word when the region matches. */ 8495 flags = (int)idxs[n]; 8496 if ((round == 2 || (flags & WF_KEEPCAP) == 0) 8497 && (flags & WF_NEEDCOMP) == 0 8498 && (do_region 8499 || (flags & WF_REGION) == 0 8500 || (((unsigned)flags >> 16) 8501 & lp->lp_region) != 0)) 8502 { 8503 word[depth] = NUL; 8504 if (!do_region) 8505 flags &= ~WF_REGION; 8506 8507 /* Dump the basic word if there is no prefix or 8508 * when it's the first one. */ 8509 c = (unsigned)flags >> 24; 8510 if (c == 0 || curi[depth] == 2) 8511 { 8512 dump_word(slang, word, pat, dir, 8513 dumpflags, flags, lnum); 8514 if (pat == NULL) 8515 ++lnum; 8516 } 8517 8518 /* Apply the prefix, if there is one. 
*/ 8519 if (c != 0) 8520 lnum = dump_prefixes(slang, word, pat, dir, 8521 dumpflags, flags, lnum); 8522 } 8523 } 8524 else 8525 { 8526 /* Normal char, go one level deeper. */ 8527 word[depth++] = c; 8528 arridx[depth] = idxs[n]; 8529 curi[depth] = 1; 8530 8531 /* Check if this characters matches with the pattern. 8532 * If not skip the whole tree below it. 8533 * Always ignore case here, dump_word() will check 8534 * proper case later. This isn't exactly right when 8535 * length changes for multi-byte characters with 8536 * ignore case... */ 8537 if (depth <= patlen 8538 && MB_STRNICMP(word, pat, depth) != 0) 8539 --depth; 8540 } 8541 } 8542 } 8543 } 8544 } 8545 } 8546 8547 /* 8548 * Dump one word: apply case modifications and append a line to the buffer. 8549 * When "lnum" is zero add insert mode completion. 8550 */ 8551 static void 8552 dump_word( 8553 slang_T *slang, 8554 char_u *word, 8555 char_u *pat, 8556 int *dir, 8557 int dumpflags, 8558 int wordflags, 8559 linenr_T lnum) 8560 { 8561 int keepcap = FALSE; 8562 char_u *p; 8563 char_u *tw; 8564 char_u cword[MAXWLEN]; 8565 char_u badword[MAXWLEN + 10]; 8566 int i; 8567 int flags = wordflags; 8568 8569 if (dumpflags & DUMPFLAG_ONECAP) 8570 flags |= WF_ONECAP; 8571 if (dumpflags & DUMPFLAG_ALLCAP) 8572 flags |= WF_ALLCAP; 8573 8574 if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) 8575 { 8576 /* Need to fix case according to "flags". */ 8577 make_case_word(word, cword, flags); 8578 p = cword; 8579 } 8580 else 8581 { 8582 p = word; 8583 if ((dumpflags & DUMPFLAG_KEEPCASE) 8584 && ((captype(word, NULL) & WF_KEEPCAP) == 0 8585 || (flags & WF_FIXCAP) != 0)) 8586 keepcap = TRUE; 8587 } 8588 tw = p; 8589 8590 if (pat == NULL) 8591 { 8592 /* Add flags and regions after a slash. 
*/ 8593 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) 8594 { 8595 STRCPY(badword, p); 8596 STRCAT(badword, "/"); 8597 if (keepcap) 8598 STRCAT(badword, "="); 8599 if (flags & WF_BANNED) 8600 STRCAT(badword, "!"); 8601 else if (flags & WF_RARE) 8602 STRCAT(badword, "?"); 8603 if (flags & WF_REGION) 8604 for (i = 0; i < 7; ++i) 8605 if (flags & (0x10000 << i)) 8606 sprintf((char *)badword + STRLEN(badword), "%d", i + 1); 8607 p = badword; 8608 } 8609 8610 if (dumpflags & DUMPFLAG_COUNT) 8611 { 8612 hashitem_T *hi; 8613 8614 /* Include the word count for ":spelldump!". */ 8615 hi = hash_find(&slang->sl_wordcount, tw); 8616 if (!HASHITEM_EMPTY(hi)) 8617 { 8618 vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d", 8619 tw, HI2WC(hi)->wc_count); 8620 p = IObuff; 8621 } 8622 } 8623 8624 ml_append(lnum, p, (colnr_T)0, FALSE); 8625 } 8626 else if (((dumpflags & DUMPFLAG_ICASE) 8627 ? MB_STRNICMP(p, pat, STRLEN(pat)) == 0 8628 : STRNCMP(p, pat, STRLEN(pat)) == 0) 8629 && ins_compl_add_infercase(p, (int)STRLEN(p), 8630 p_ic, NULL, *dir, 0) == OK) 8631 /* if dir was BACKWARD then honor it just once */ 8632 *dir = FORWARD; 8633 } 8634 8635 /* 8636 * For ":spelldump": Find matching prefixes for "word". Prepend each to 8637 * "word" and append a line to the buffer. 8638 * When "lnum" is zero add insert mode completion. 8639 * Return the updated line number. 
8640 */ 8641 static linenr_T 8642 dump_prefixes( 8643 slang_T *slang, 8644 char_u *word, /* case-folded word */ 8645 char_u *pat, 8646 int *dir, 8647 int dumpflags, 8648 int flags, /* flags with prefix ID */ 8649 linenr_T startlnum) 8650 { 8651 idx_T arridx[MAXWLEN]; 8652 int curi[MAXWLEN]; 8653 char_u prefix[MAXWLEN]; 8654 char_u word_up[MAXWLEN]; 8655 int has_word_up = FALSE; 8656 int c; 8657 char_u *byts; 8658 idx_T *idxs; 8659 linenr_T lnum = startlnum; 8660 int depth; 8661 int n; 8662 int len; 8663 int i; 8664 8665 /* If the word starts with a lower-case letter make the word with an 8666 * upper-case letter in word_up[]. */ 8667 c = PTR2CHAR(word); 8668 if (SPELL_TOUPPER(c) != c) 8669 { 8670 onecap_copy(word, word_up, TRUE); 8671 has_word_up = TRUE; 8672 } 8673 8674 byts = slang->sl_pbyts; 8675 idxs = slang->sl_pidxs; 8676 if (byts != NULL) /* array not is empty */ 8677 { 8678 /* 8679 * Loop over all prefixes, building them byte-by-byte in prefix[]. 8680 * When at the end of a prefix check that it supports "flags". 8681 */ 8682 depth = 0; 8683 arridx[0] = 0; 8684 curi[0] = 1; 8685 while (depth >= 0 && !got_int) 8686 { 8687 n = arridx[depth]; 8688 len = byts[n]; 8689 if (curi[depth] > len) 8690 { 8691 /* Done all bytes at this node, go up one level. */ 8692 --depth; 8693 line_breakcheck(); 8694 } 8695 else 8696 { 8697 /* Do one more byte at this node. */ 8698 n += curi[depth]; 8699 ++curi[depth]; 8700 c = byts[n]; 8701 if (c == 0) 8702 { 8703 /* End of prefix, find out how many IDs there are. */ 8704 for (i = 1; i < len; ++i) 8705 if (byts[n + i] != 0) 8706 break; 8707 curi[depth] += i - 1; 8708 8709 c = valid_word_prefix(i, n, flags, word, slang, FALSE); 8710 if (c != 0) 8711 { 8712 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1); 8713 dump_word(slang, prefix, pat, dir, dumpflags, 8714 (c & WF_RAREPFX) ? 
(flags | WF_RARE) 8715 : flags, lnum); 8716 if (lnum != 0) 8717 ++lnum; 8718 } 8719 8720 /* Check for prefix that matches the word when the 8721 * first letter is upper-case, but only if the prefix has 8722 * a condition. */ 8723 if (has_word_up) 8724 { 8725 c = valid_word_prefix(i, n, flags, word_up, slang, 8726 TRUE); 8727 if (c != 0) 8728 { 8729 vim_strncpy(prefix + depth, word_up, 8730 MAXWLEN - depth - 1); 8731 dump_word(slang, prefix, pat, dir, dumpflags, 8732 (c & WF_RAREPFX) ? (flags | WF_RARE) 8733 : flags, lnum); 8734 if (lnum != 0) 8735 ++lnum; 8736 } 8737 } 8738 } 8739 else 8740 { 8741 /* Normal char, go one level deeper. */ 8742 prefix[depth++] = c; 8743 arridx[depth] = idxs[n]; 8744 curi[depth] = 1; 8745 } 8746 } 8747 } 8748 } 8749 8750 return lnum; 8751 } 8752 8753 /* 8754 * Move "p" to the end of word "start". 8755 * Uses the spell-checking word characters. 8756 */ 8757 char_u * 8758 spell_to_word_end(char_u *start, win_T *win) 8759 { 8760 char_u *p = start; 8761 8762 while (*p != NUL && spell_iswordp(p, win)) 8763 MB_PTR_ADV(p); 8764 return p; 8765 } 8766 8767 #if defined(FEAT_INS_EXPAND) || defined(PROTO) 8768 /* 8769 * For Insert mode completion CTRL-X s: 8770 * Find start of the word in front of column "startcol". 8771 * We don't check if it is badly spelled, with completion we can only change 8772 * the word in front of the cursor. 8773 * Returns the column number of the word. 8774 */ 8775 int 8776 spell_word_start(int startcol) 8777 { 8778 char_u *line; 8779 char_u *p; 8780 int col = 0; 8781 8782 if (no_spell_checking(curwin)) 8783 return startcol; 8784 8785 /* Find a word character before "startcol". */ 8786 line = ml_get_curline(); 8787 for (p = line + startcol; p > line; ) 8788 { 8789 MB_PTR_BACK(line, p); 8790 if (spell_iswordp_nmw(p, curwin)) 8791 break; 8792 } 8793 8794 /* Go back to start of the word. 
*/ 8795 while (p > line) 8796 { 8797 col = (int)(p - line); 8798 MB_PTR_BACK(line, p); 8799 if (!spell_iswordp(p, curwin)) 8800 break; 8801 col = 0; 8802 } 8803 8804 return col; 8805 } 8806 8807 /* 8808 * Need to check for 'spellcapcheck' now, the word is removed before 8809 * expand_spelling() is called. Therefore the ugly global variable. 8810 */ 8811 static int spell_expand_need_cap; 8812 8813 void 8814 spell_expand_check_cap(colnr_T col) 8815 { 8816 spell_expand_need_cap = check_need_cap(curwin->w_cursor.lnum, col); 8817 } 8818 8819 /* 8820 * Get list of spelling suggestions. 8821 * Used for Insert mode completion CTRL-X ?. 8822 * Returns the number of matches. The matches are in "matchp[]", array of 8823 * allocated strings. 8824 */ 8825 int 8826 expand_spelling( 8827 linenr_T lnum UNUSED, 8828 char_u *pat, 8829 char_u ***matchp) 8830 { 8831 garray_T ga; 8832 8833 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, TRUE); 8834 *matchp = ga.ga_data; 8835 return ga.ga_len; 8836 } 8837 #endif 8838 8839 #endif /* FEAT_SPELL */ 8840