xref: /sqlite-3.40.0/ext/fts5/fts5_hash.c (revision f7ff7556)
1 /*
2 ** 2014 August 11
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 */
14 
15 
16 
17 #include "fts5Int.h"
18 
19 typedef struct Fts5HashEntry Fts5HashEntry;
20 
21 /*
22 ** This file contains the implementation of an in-memory hash table used
23 ** to accumuluate "term -> doclist" content before it is flused to a level-0
24 ** segment.
25 */
26 
27 
28 struct Fts5Hash {
29   int eDetail;                    /* Copy of Fts5Config.eDetail */
30   int *pnByte;                    /* Pointer to bytes counter */
31   int nEntry;                     /* Number of entries currently in hash */
32   int nSlot;                      /* Size of aSlot[] array */
33   Fts5HashEntry *pScan;           /* Current ordered scan item */
34   Fts5HashEntry **aSlot;          /* Array of hash slots */
35 };
36 
37 /*
38 ** Each entry in the hash table is represented by an object of the
39 ** following type. Each object, its key (a nul-terminated string) and
40 ** its current data are stored in a single memory allocation. The
41 ** key immediately follows the object in memory. The position list
42 ** data immediately follows the key data in memory.
43 **
44 ** The data that follows the key is in a similar, but not identical format
45 ** to the doclist data stored in the database. It is:
46 **
47 **   * Rowid, as a varint
48 **   * Position list, without 0x00 terminator.
49 **   * Size of previous position list and rowid, as a 4 byte
50 **     big-endian integer.
51 **
52 ** iRowidOff:
53 **   Offset of last rowid written to data area. Relative to first byte of
54 **   structure.
55 **
56 ** nData:
57 **   Bytes of data written since iRowidOff.
58 */
59 struct Fts5HashEntry {
60   Fts5HashEntry *pHashNext;       /* Next hash entry with same hash-key */
61   Fts5HashEntry *pScanNext;       /* Next entry in sorted order */
62 
63   int nAlloc;                     /* Total size of allocation */
64   int iSzPoslist;                 /* Offset of space for 4-byte poslist size */
65   int nData;                      /* Total bytes of data (incl. structure) */
66   int nKey;                       /* Length of key in bytes */
67   u8 bDel;                        /* Set delete-flag @ iSzPoslist */
68   u8 bContent;                    /* Set content-flag (detail=none mode) */
69   i16 iCol;                       /* Column of last value written */
70   int iPos;                       /* Position of last value written */
71   i64 iRowid;                     /* Rowid of last value written */
72 };
73 
74 /*
75 ** Eqivalent to:
76 **
77 **   char *fts5EntryKey(Fts5HashEntry *pEntry){ return zKey; }
78 */
79 #define fts5EntryKey(p) ( ((char *)(&(p)[1])) )
80 
81 
82 /*
83 ** Allocate a new hash table.
84 */
sqlite3Fts5HashNew(Fts5Config * pConfig,Fts5Hash ** ppNew,int * pnByte)85 int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
86   int rc = SQLITE_OK;
87   Fts5Hash *pNew;
88 
89   *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
90   if( pNew==0 ){
91     rc = SQLITE_NOMEM;
92   }else{
93     sqlite3_int64 nByte;
94     memset(pNew, 0, sizeof(Fts5Hash));
95     pNew->pnByte = pnByte;
96     pNew->eDetail = pConfig->eDetail;
97 
98     pNew->nSlot = 1024;
99     nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
100     pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc64(nByte);
101     if( pNew->aSlot==0 ){
102       sqlite3_free(pNew);
103       *ppNew = 0;
104       rc = SQLITE_NOMEM;
105     }else{
106       memset(pNew->aSlot, 0, (size_t)nByte);
107     }
108   }
109   return rc;
110 }
111 
112 /*
113 ** Free a hash table object.
114 */
sqlite3Fts5HashFree(Fts5Hash * pHash)115 void sqlite3Fts5HashFree(Fts5Hash *pHash){
116   if( pHash ){
117     sqlite3Fts5HashClear(pHash);
118     sqlite3_free(pHash->aSlot);
119     sqlite3_free(pHash);
120   }
121 }
122 
123 /*
124 ** Empty (but do not delete) a hash table.
125 */
sqlite3Fts5HashClear(Fts5Hash * pHash)126 void sqlite3Fts5HashClear(Fts5Hash *pHash){
127   int i;
128   for(i=0; i<pHash->nSlot; i++){
129     Fts5HashEntry *pNext;
130     Fts5HashEntry *pSlot;
131     for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){
132       pNext = pSlot->pHashNext;
133       sqlite3_free(pSlot);
134     }
135   }
136   memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*));
137   pHash->nEntry = 0;
138 }
139 
fts5HashKey(int nSlot,const u8 * p,int n)140 static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){
141   int i;
142   unsigned int h = 13;
143   for(i=n-1; i>=0; i--){
144     h = (h << 3) ^ h ^ p[i];
145   }
146   return (h % nSlot);
147 }
148 
fts5HashKey2(int nSlot,u8 b,const u8 * p,int n)149 static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){
150   int i;
151   unsigned int h = 13;
152   for(i=n-1; i>=0; i--){
153     h = (h << 3) ^ h ^ p[i];
154   }
155   h = (h << 3) ^ h ^ b;
156   return (h % nSlot);
157 }
158 
159 /*
160 ** Resize the hash table by doubling the number of slots.
161 */
fts5HashResize(Fts5Hash * pHash)162 static int fts5HashResize(Fts5Hash *pHash){
163   int nNew = pHash->nSlot*2;
164   int i;
165   Fts5HashEntry **apNew;
166   Fts5HashEntry **apOld = pHash->aSlot;
167 
168   apNew = (Fts5HashEntry**)sqlite3_malloc64(nNew*sizeof(Fts5HashEntry*));
169   if( !apNew ) return SQLITE_NOMEM;
170   memset(apNew, 0, nNew*sizeof(Fts5HashEntry*));
171 
172   for(i=0; i<pHash->nSlot; i++){
173     while( apOld[i] ){
174       unsigned int iHash;
175       Fts5HashEntry *p = apOld[i];
176       apOld[i] = p->pHashNext;
177       iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p),
178                           (int)strlen(fts5EntryKey(p)));
179       p->pHashNext = apNew[iHash];
180       apNew[iHash] = p;
181     }
182   }
183 
184   sqlite3_free(apOld);
185   pHash->nSlot = nNew;
186   pHash->aSlot = apNew;
187   return SQLITE_OK;
188 }
189 
fts5HashAddPoslistSize(Fts5Hash * pHash,Fts5HashEntry * p,Fts5HashEntry * p2)190 static int fts5HashAddPoslistSize(
191   Fts5Hash *pHash,
192   Fts5HashEntry *p,
193   Fts5HashEntry *p2
194 ){
195   int nRet = 0;
196   if( p->iSzPoslist ){
197     u8 *pPtr = p2 ? (u8*)p2 : (u8*)p;
198     int nData = p->nData;
199     if( pHash->eDetail==FTS5_DETAIL_NONE ){
200       assert( nData==p->iSzPoslist );
201       if( p->bDel ){
202         pPtr[nData++] = 0x00;
203         if( p->bContent ){
204           pPtr[nData++] = 0x00;
205         }
206       }
207     }else{
208       int nSz = (nData - p->iSzPoslist - 1);       /* Size in bytes */
209       int nPos = nSz*2 + p->bDel;                     /* Value of nPos field */
210 
211       assert( p->bDel==0 || p->bDel==1 );
212       if( nPos<=127 ){
213         pPtr[p->iSzPoslist] = (u8)nPos;
214       }else{
215         int nByte = sqlite3Fts5GetVarintLen((u32)nPos);
216         memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz);
217         sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos);
218         nData += (nByte-1);
219       }
220     }
221 
222     nRet = nData - p->nData;
223     if( p2==0 ){
224       p->iSzPoslist = 0;
225       p->bDel = 0;
226       p->bContent = 0;
227       p->nData = nData;
228     }
229   }
230   return nRet;
231 }
232 
233 /*
234 ** Add an entry to the in-memory hash table. The key is the concatenation
235 ** of bByte and (pToken/nToken). The value is (iRowid/iCol/iPos).
236 **
237 **     (bByte || pToken) -> (iRowid,iCol,iPos)
238 **
239 ** Or, if iCol is negative, then the value is a delete marker.
240 */
sqlite3Fts5HashWrite(Fts5Hash * pHash,i64 iRowid,int iCol,int iPos,char bByte,const char * pToken,int nToken)241 int sqlite3Fts5HashWrite(
242   Fts5Hash *pHash,
243   i64 iRowid,                     /* Rowid for this entry */
244   int iCol,                       /* Column token appears in (-ve -> delete) */
245   int iPos,                       /* Position of token within column */
246   char bByte,                     /* First byte of token */
247   const char *pToken, int nToken  /* Token to add or remove to or from index */
248 ){
249   unsigned int iHash;
250   Fts5HashEntry *p;
251   u8 *pPtr;
252   int nIncr = 0;                  /* Amount to increment (*pHash->pnByte) by */
253   int bNew;                       /* If non-delete entry should be written */
254 
255   bNew = (pHash->eDetail==FTS5_DETAIL_FULL);
256 
257   /* Attempt to locate an existing hash entry */
258   iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
259   for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
260     char *zKey = fts5EntryKey(p);
261     if( zKey[0]==bByte
262      && p->nKey==nToken
263      && memcmp(&zKey[1], pToken, nToken)==0
264     ){
265       break;
266     }
267   }
268 
269   /* If an existing hash entry cannot be found, create a new one. */
270   if( p==0 ){
271     /* Figure out how much space to allocate */
272     char *zKey;
273     sqlite3_int64 nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64;
274     if( nByte<128 ) nByte = 128;
275 
276     /* Grow the Fts5Hash.aSlot[] array if necessary. */
277     if( (pHash->nEntry*2)>=pHash->nSlot ){
278       int rc = fts5HashResize(pHash);
279       if( rc!=SQLITE_OK ) return rc;
280       iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
281     }
282 
283     /* Allocate new Fts5HashEntry and add it to the hash table. */
284     p = (Fts5HashEntry*)sqlite3_malloc64(nByte);
285     if( !p ) return SQLITE_NOMEM;
286     memset(p, 0, sizeof(Fts5HashEntry));
287     p->nAlloc = (int)nByte;
288     zKey = fts5EntryKey(p);
289     zKey[0] = bByte;
290     memcpy(&zKey[1], pToken, nToken);
291     assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) );
292     p->nKey = nToken;
293     zKey[nToken+1] = '\0';
294     p->nData = nToken+1 + 1 + sizeof(Fts5HashEntry);
295     p->pHashNext = pHash->aSlot[iHash];
296     pHash->aSlot[iHash] = p;
297     pHash->nEntry++;
298 
299     /* Add the first rowid field to the hash-entry */
300     p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
301     p->iRowid = iRowid;
302 
303     p->iSzPoslist = p->nData;
304     if( pHash->eDetail!=FTS5_DETAIL_NONE ){
305       p->nData += 1;
306       p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
307     }
308 
309   }else{
310 
311     /* Appending to an existing hash-entry. Check that there is enough
312     ** space to append the largest possible new entry. Worst case scenario
313     ** is:
314     **
315     **     + 9 bytes for a new rowid,
316     **     + 4 byte reserved for the "poslist size" varint.
317     **     + 1 byte for a "new column" byte,
318     **     + 3 bytes for a new column number (16-bit max) as a varint,
319     **     + 5 bytes for the new position offset (32-bit max).
320     */
321     if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){
322       sqlite3_int64 nNew = p->nAlloc * 2;
323       Fts5HashEntry *pNew;
324       Fts5HashEntry **pp;
325       pNew = (Fts5HashEntry*)sqlite3_realloc64(p, nNew);
326       if( pNew==0 ) return SQLITE_NOMEM;
327       pNew->nAlloc = (int)nNew;
328       for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext);
329       *pp = pNew;
330       p = pNew;
331     }
332     nIncr -= p->nData;
333   }
334   assert( (p->nAlloc - p->nData) >= (9 + 4 + 1 + 3 + 5) );
335 
336   pPtr = (u8*)p;
337 
338   /* If this is a new rowid, append the 4-byte size field for the previous
339   ** entry, and the new rowid for this entry.  */
340   if( iRowid!=p->iRowid ){
341     u64 iDiff = (u64)iRowid - (u64)p->iRowid;
342     fts5HashAddPoslistSize(pHash, p, 0);
343     p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iDiff);
344     p->iRowid = iRowid;
345     bNew = 1;
346     p->iSzPoslist = p->nData;
347     if( pHash->eDetail!=FTS5_DETAIL_NONE ){
348       p->nData += 1;
349       p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
350       p->iPos = 0;
351     }
352   }
353 
354   if( iCol>=0 ){
355     if( pHash->eDetail==FTS5_DETAIL_NONE ){
356       p->bContent = 1;
357     }else{
358       /* Append a new column value, if necessary */
359       assert_nc( iCol>=p->iCol );
360       if( iCol!=p->iCol ){
361         if( pHash->eDetail==FTS5_DETAIL_FULL ){
362           pPtr[p->nData++] = 0x01;
363           p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
364           p->iCol = (i16)iCol;
365           p->iPos = 0;
366         }else{
367           bNew = 1;
368           p->iCol = (i16)(iPos = iCol);
369         }
370       }
371 
372       /* Append the new position offset, if necessary */
373       if( bNew ){
374         p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
375         p->iPos = iPos;
376       }
377     }
378   }else{
379     /* This is a delete. Set the delete flag. */
380     p->bDel = 1;
381   }
382 
383   nIncr += p->nData;
384   *pHash->pnByte += nIncr;
385   return SQLITE_OK;
386 }
387 
388 
389 /*
390 ** Arguments pLeft and pRight point to linked-lists of hash-entry objects,
391 ** each sorted in key order. This function merges the two lists into a
392 ** single list and returns a pointer to its first element.
393 */
fts5HashEntryMerge(Fts5HashEntry * pLeft,Fts5HashEntry * pRight)394 static Fts5HashEntry *fts5HashEntryMerge(
395   Fts5HashEntry *pLeft,
396   Fts5HashEntry *pRight
397 ){
398   Fts5HashEntry *p1 = pLeft;
399   Fts5HashEntry *p2 = pRight;
400   Fts5HashEntry *pRet = 0;
401   Fts5HashEntry **ppOut = &pRet;
402 
403   while( p1 || p2 ){
404     if( p1==0 ){
405       *ppOut = p2;
406       p2 = 0;
407     }else if( p2==0 ){
408       *ppOut = p1;
409       p1 = 0;
410     }else{
411       int i = 0;
412       char *zKey1 = fts5EntryKey(p1);
413       char *zKey2 = fts5EntryKey(p2);
414       while( zKey1[i]==zKey2[i] ) i++;
415 
416       if( ((u8)zKey1[i])>((u8)zKey2[i]) ){
417         /* p2 is smaller */
418         *ppOut = p2;
419         ppOut = &p2->pScanNext;
420         p2 = p2->pScanNext;
421       }else{
422         /* p1 is smaller */
423         *ppOut = p1;
424         ppOut = &p1->pScanNext;
425         p1 = p1->pScanNext;
426       }
427       *ppOut = 0;
428     }
429   }
430 
431   return pRet;
432 }
433 
434 /*
435 ** Extract all tokens from hash table iHash and link them into a list
436 ** in sorted order. The hash table is cleared before returning. It is
437 ** the responsibility of the caller to free the elements of the returned
438 ** list.
439 */
fts5HashEntrySort(Fts5Hash * pHash,const char * pTerm,int nTerm,Fts5HashEntry ** ppSorted)440 static int fts5HashEntrySort(
441   Fts5Hash *pHash,
442   const char *pTerm, int nTerm,   /* Query prefix, if any */
443   Fts5HashEntry **ppSorted
444 ){
445   const int nMergeSlot = 32;
446   Fts5HashEntry **ap;
447   Fts5HashEntry *pList;
448   int iSlot;
449   int i;
450 
451   *ppSorted = 0;
452   ap = sqlite3_malloc64(sizeof(Fts5HashEntry*) * nMergeSlot);
453   if( !ap ) return SQLITE_NOMEM;
454   memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot);
455 
456   for(iSlot=0; iSlot<pHash->nSlot; iSlot++){
457     Fts5HashEntry *pIter;
458     for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){
459       if( pTerm==0
460        || (pIter->nKey+1>=nTerm && 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm))
461       ){
462         Fts5HashEntry *pEntry = pIter;
463         pEntry->pScanNext = 0;
464         for(i=0; ap[i]; i++){
465           pEntry = fts5HashEntryMerge(pEntry, ap[i]);
466           ap[i] = 0;
467         }
468         ap[i] = pEntry;
469       }
470     }
471   }
472 
473   pList = 0;
474   for(i=0; i<nMergeSlot; i++){
475     pList = fts5HashEntryMerge(pList, ap[i]);
476   }
477 
478   pHash->nEntry = 0;
479   sqlite3_free(ap);
480   *ppSorted = pList;
481   return SQLITE_OK;
482 }
483 
484 /*
485 ** Query the hash table for a doclist associated with term pTerm/nTerm.
486 */
sqlite3Fts5HashQuery(Fts5Hash * pHash,int nPre,const char * pTerm,int nTerm,void ** ppOut,int * pnDoclist)487 int sqlite3Fts5HashQuery(
488   Fts5Hash *pHash,                /* Hash table to query */
489   int nPre,
490   const char *pTerm, int nTerm,   /* Query term */
491   void **ppOut,                   /* OUT: Pointer to new object */
492   int *pnDoclist                  /* OUT: Size of doclist in bytes */
493 ){
494   unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm);
495   char *zKey = 0;
496   Fts5HashEntry *p;
497 
498   for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
499     zKey = fts5EntryKey(p);
500     assert( p->nKey+1==(int)strlen(zKey) );
501     if( nTerm==p->nKey+1 && memcmp(zKey, pTerm, nTerm)==0 ) break;
502   }
503 
504   if( p ){
505     int nHashPre = sizeof(Fts5HashEntry) + nTerm + 1;
506     int nList = p->nData - nHashPre;
507     u8 *pRet = (u8*)(*ppOut = sqlite3_malloc64(nPre + nList + 10));
508     if( pRet ){
509       Fts5HashEntry *pFaux = (Fts5HashEntry*)&pRet[nPre-nHashPre];
510       memcpy(&pRet[nPre], &((u8*)p)[nHashPre], nList);
511       nList += fts5HashAddPoslistSize(pHash, p, pFaux);
512       *pnDoclist = nList;
513     }else{
514       *pnDoclist = 0;
515       return SQLITE_NOMEM;
516     }
517   }else{
518     *ppOut = 0;
519     *pnDoclist = 0;
520   }
521 
522   return SQLITE_OK;
523 }
524 
sqlite3Fts5HashScanInit(Fts5Hash * p,const char * pTerm,int nTerm)525 int sqlite3Fts5HashScanInit(
526   Fts5Hash *p,                    /* Hash table to query */
527   const char *pTerm, int nTerm    /* Query prefix */
528 ){
529   return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
530 }
531 
sqlite3Fts5HashScanNext(Fts5Hash * p)532 void sqlite3Fts5HashScanNext(Fts5Hash *p){
533   assert( !sqlite3Fts5HashScanEof(p) );
534   p->pScan = p->pScan->pScanNext;
535 }
536 
sqlite3Fts5HashScanEof(Fts5Hash * p)537 int sqlite3Fts5HashScanEof(Fts5Hash *p){
538   return (p->pScan==0);
539 }
540 
sqlite3Fts5HashScanEntry(Fts5Hash * pHash,const char ** pzTerm,const u8 ** ppDoclist,int * pnDoclist)541 void sqlite3Fts5HashScanEntry(
542   Fts5Hash *pHash,
543   const char **pzTerm,            /* OUT: term (nul-terminated) */
544   const u8 **ppDoclist,           /* OUT: pointer to doclist */
545   int *pnDoclist                  /* OUT: size of doclist in bytes */
546 ){
547   Fts5HashEntry *p;
548   if( (p = pHash->pScan) ){
549     char *zKey = fts5EntryKey(p);
550     int nTerm = (int)strlen(zKey);
551     fts5HashAddPoslistSize(pHash, p, 0);
552     *pzTerm = zKey;
553     *ppDoclist = (const u8*)&zKey[nTerm+1];
554     *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1);
555   }else{
556     *pzTerm = 0;
557     *ppDoclist = 0;
558     *pnDoclist = 0;
559   }
560 }
561