xref: /oneTBB/examples/graph/fgbzip2/blocksort.cpp (revision b15aabb3)
1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 /*-------------------------------------------------------------*/
18 /*--- Block sorting machinery                               ---*/
19 /*---                                         blocksort.cpp ---*/
20 /*-------------------------------------------------------------*/
21 
22 /* ------------------------------------------------------------------
23     The original source for this example:
24     This file is part of bzip2/libbzip2, a program and library for
25     lossless, block-sorting data compression.
26 
27     bzip2/libbzip2 version 1.0.6 of 6 September 2010
28     Copyright (C) 1996-2010 Julian Seward <[email protected]>
29 
30     This program, "bzip2", the associated library "libbzip2", and all
31     documentation, are copyright (C) 1996-2010 Julian R Seward.  All
32     rights reserved.
33 
34     Redistribution and use in source and binary forms, with or without
35     modification, are permitted provided that the following conditions
36     are met:
37 
38     1. Redistributions of source code must retain the above copyright
39     notice, this list of conditions and the following disclaimer.
40 
41     2. The origin of this software must not be misrepresented; you must
42     not claim that you wrote the original software.  If you use this
43     software in a product, an acknowledgment in the product
44     documentation would be appreciated but is not required.
45 
46     3. Altered source versions must be plainly marked as such, and must
47     not be misrepresented as being the original software.
48 
49     4. The name of the author may not be used to endorse or promote
50     products derived from this software without specific prior written
51     permission.
52 
53     THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
54     OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
55     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56     ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
57     DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
59     GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
60     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
61     WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 
65     Julian Seward, [email protected]
66     bzip2/libbzip2 version 1.0.6 of 6 September 2010
67    ------------------------------------------------------------------ */
68 
69 #include "bzlib_private.hpp"
70 
71 /*---------------------------------------------*/
72 /*--- Fallback O(N log(N)^2) sorting        ---*/
73 /*--- algorithm, for repetitive blocks      ---*/
74 /*---------------------------------------------*/
75 
76 /*---------------------------------------------*/
fallbackSimpleSort(UInt32 * fmap,UInt32 * eclass,Int32 lo,Int32 hi)77 static __inline__ void fallbackSimpleSort(UInt32* fmap, UInt32* eclass, Int32 lo, Int32 hi) {
78     Int32 i, j, tmp;
79     UInt32 ec_tmp;
80 
81     if (lo == hi)
82         return;
83 
84     if (hi - lo > 3) {
85         for (i = hi - 4; i >= lo; i--) {
86             tmp = fmap[i];
87             ec_tmp = eclass[tmp];
88             for (j = i + 4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
89                 fmap[j - 4] = fmap[j];
90             fmap[j - 4] = tmp;
91         }
92     }
93 
94     for (i = hi - 1; i >= lo; i--) {
95         tmp = fmap[i];
96         ec_tmp = eclass[tmp];
97         for (j = i + 1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
98             fmap[j - 1] = fmap[j];
99         fmap[j - 1] = tmp;
100     }
101 }
102 
103 /*---------------------------------------------*/
104 #define fswap(zz1, zz2)    \
105     {                      \
106         Int32 zztmp = zz1; \
107         zz1 = zz2;         \
108         zz2 = zztmp;       \
109     }
110 
111 #define fvswap(zzp1, zzp2, zzn)            \
112     {                                      \
113         Int32 yyp1 = (zzp1);               \
114         Int32 yyp2 = (zzp2);               \
115         Int32 yyn = (zzn);                 \
116         while (yyn > 0) {                  \
117             fswap(fmap[yyp1], fmap[yyp2]); \
118             yyp1++;                        \
119             yyp2++;                        \
120             yyn--;                         \
121         }                                  \
122     }
123 
124 #define fmin(a, b) ((a) < (b)) ? (a) : (b)
125 
126 #define fpush(lz, hz)     \
127     {                     \
128         stackLo[sp] = lz; \
129         stackHi[sp] = hz; \
130         sp++;             \
131     }
132 
133 #define fpop(lz, hz)      \
134     {                     \
135         sp--;             \
136         lz = stackLo[sp]; \
137         hz = stackHi[sp]; \
138     }
139 
140 #define FALLBACK_QSORT_SMALL_THRESH 10
141 #define FALLBACK_QSORT_STACK_SIZE   100
142 
fallbackQSort3(UInt32 * fmap,UInt32 * eclass,Int32 loSt,Int32 hiSt)143 static void fallbackQSort3(UInt32* fmap, UInt32* eclass, Int32 loSt, Int32 hiSt) {
144     Int32 unLo, unHi, ltLo, gtHi, n, m;
145     Int32 sp, lo, hi;
146     UInt32 med, r, r3;
147     Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
148     Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
149 
150     r = 0;
151 
152     sp = 0;
153     fpush(loSt, hiSt);
154 
155     while (sp > 0) {
156         AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
157 
158         fpop(lo, hi);
159         if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
160             fallbackSimpleSort(fmap, eclass, lo, hi);
161             continue;
162         }
163 
164         /* Random partitioning.  Median of 3 sometimes fails to
165          avoid bad cases.  Median of 9 seems to help but
166          looks rather expensive.  This too seems to work but
167          is cheaper.  Guidance for the magic constants
168          7621 and 32768 is taken from Sedgewick's algorithms
169          book, chapter 35.
170       */
171         r = ((r * 7621) + 1) % 32768;
172         r3 = r % 3;
173         if (r3 == 0)
174             med = eclass[fmap[lo]];
175         else if (r3 == 1)
176             med = eclass[fmap[(lo + hi) >> 1]];
177         else
178             med = eclass[fmap[hi]];
179 
180         unLo = ltLo = lo;
181         unHi = gtHi = hi;
182 
183         while (1) {
184             while (1) {
185                 if (unLo > unHi)
186                     break;
187                 n = (Int32)eclass[fmap[unLo]] - (Int32)med;
188                 if (n == 0) {
189                     fswap(fmap[unLo], fmap[ltLo]);
190                     ltLo++;
191                     unLo++;
192                     continue;
193                 };
194                 if (n > 0)
195                     break;
196                 unLo++;
197             }
198             while (1) {
199                 if (unLo > unHi)
200                     break;
201                 n = (Int32)eclass[fmap[unHi]] - (Int32)med;
202                 if (n == 0) {
203                     fswap(fmap[unHi], fmap[gtHi]);
204                     gtHi--;
205                     unHi--;
206                     continue;
207                 };
208                 if (n < 0)
209                     break;
210                 unHi--;
211             }
212             if (unLo > unHi)
213                 break;
214             fswap(fmap[unLo], fmap[unHi]);
215             unLo++;
216             unHi--;
217         }
218 
219         AssertD(unHi == unLo - 1, "fallbackQSort3(2)");
220 
221         if (gtHi < ltLo)
222             continue;
223 
224         n = fmin(ltLo - lo, unLo - ltLo);
225         fvswap(lo, unLo - n, n);
226         m = fmin(hi - gtHi, gtHi - unHi);
227         fvswap(unLo, hi - m + 1, m);
228 
229         n = lo + unLo - ltLo - 1;
230         m = hi - (gtHi - unHi) + 1;
231 
232         if (n - lo > hi - m) {
233             fpush(lo, n);
234             fpush(m, hi);
235         }
236         else {
237             fpush(m, hi);
238             fpush(lo, n);
239         }
240     }
241 }
242 
243 #undef fmin
244 #undef fpush
245 #undef fpop
246 #undef fswap
247 #undef fvswap
248 #undef FALLBACK_QSORT_SMALL_THRESH
249 #undef FALLBACK_QSORT_STACK_SIZE
250 
251 /*---------------------------------------------*/
252 /* Pre:
253       nblock > 0
254       eclass exists for [0 .. nblock-1]
255       ((UChar*)eclass) [0 .. nblock-1] holds block
256       ptr exists for [0 .. nblock-1]
257 
258    Post:
259       ((UChar*)eclass) [0 .. nblock-1] holds block
260       All other areas of eclass destroyed
261       fmap [0 .. nblock-1] holds sorted order
262       bhtab [ 0 .. 2+(nblock/32) ] destroyed
263 */
264 
265 #define SET_BH(zz)       bhtab[(zz) >> 5] |= (1 << ((zz)&31))
266 #define CLEAR_BH(zz)     bhtab[(zz) >> 5] &= ~(1 << ((zz)&31))
267 #define ISSET_BH(zz)     (bhtab[(zz) >> 5] & (1 << ((zz)&31)))
268 #define WORD_BH(zz)      bhtab[(zz) >> 5]
269 #define UNALIGNED_BH(zz) ((zz)&0x01f)
270 
fallbackSort(UInt32 * fmap,UInt32 * eclass,UInt32 * bhtab,Int32 nblock,Int32 verb)271 static void fallbackSort(UInt32* fmap, UInt32* eclass, UInt32* bhtab, Int32 nblock, Int32 verb) {
272     Int32 ftab[257];
273     Int32 ftabCopy[256];
274     Int32 H, i, j, k, l, r, cc, cc1;
275     Int32 nNotDone;
276     Int32 nBhtab;
277     UChar* eclass8 = (UChar*)eclass;
278 
279     /*--
280       Initial 1-char radix sort to generate
281       initial fmap and initial BH bits.
282    --*/
283     if (verb >= 4)
284         VPrintf0("        bucket sorting ...\n");
285     for (i = 0; i < 257; i++)
286         ftab[i] = 0;
287     for (i = 0; i < nblock; i++)
288         ftab[eclass8[i]]++;
289     for (i = 0; i < 256; i++)
290         ftabCopy[i] = ftab[i];
291     for (i = 1; i < 257; i++)
292         ftab[i] += ftab[i - 1];
293 
294     for (i = 0; i < nblock; i++) {
295         j = eclass8[i];
296         k = ftab[j] - 1;
297         ftab[j] = k;
298         fmap[k] = i;
299     }
300 
301     nBhtab = 2 + (nblock / 32);
302     for (i = 0; i < nBhtab; i++)
303         bhtab[i] = 0;
304     for (i = 0; i < 256; i++)
305         SET_BH(ftab[i]);
306 
307     /*--
308       Inductively refine the buckets.  Kind-of an
309       "exponential radix sort" (!), inspired by the
310       Manber-Myers suffix array construction algorithm.
311    --*/
312 
313     /*-- set sentinel bits for block-end detection --*/
314     for (i = 0; i < 32; i++) {
315         SET_BH(nblock + 2 * i);
316         CLEAR_BH(nblock + 2 * i + 1);
317     }
318 
319     /*-- the log(N) loop --*/
320     H = 1;
321     while (1) {
322         if (verb >= 4)
323             VPrintf1("        depth %6d has ", H);
324 
325         j = 0;
326         for (i = 0; i < nblock; i++) {
327             if (ISSET_BH(i))
328                 j = i;
329             k = fmap[i] - H;
330             if (k < 0)
331                 k += nblock;
332             eclass[k] = j;
333         }
334 
335         nNotDone = 0;
336         r = -1;
337         while (1) {
338             /*-- find the next non-singleton bucket --*/
339             k = r + 1;
340             while (ISSET_BH(k) && UNALIGNED_BH(k))
341                 k++;
342             if (ISSET_BH(k)) {
343                 while (WORD_BH(k) == 0xffffffff)
344                     k += 32;
345                 while (ISSET_BH(k))
346                     k++;
347             }
348             l = k - 1;
349             if (l >= nblock)
350                 break;
351             while (!ISSET_BH(k) && UNALIGNED_BH(k))
352                 k++;
353             if (!ISSET_BH(k)) {
354                 while (WORD_BH(k) == 0x00000000)
355                     k += 32;
356                 while (!ISSET_BH(k))
357                     k++;
358             }
359             r = k - 1;
360             if (r >= nblock)
361                 break;
362 
363             /*-- now [l, r] bracket current bucket --*/
364             if (r > l) {
365                 nNotDone += (r - l + 1);
366                 fallbackQSort3(fmap, eclass, l, r);
367 
368                 /*-- scan bucket and generate header bits-- */
369                 cc = -1;
370                 for (i = l; i <= r; i++) {
371                     cc1 = eclass[fmap[i]];
372                     if (cc != cc1) {
373                         SET_BH(i);
374                         cc = cc1;
375                     };
376                 }
377             }
378         }
379 
380         if (verb >= 4)
381             VPrintf1("%6d unresolved strings\n", nNotDone);
382 
383         H *= 2;
384         if (H > nblock || nNotDone == 0)
385             break;
386     }
387 
388     /*--
389       Reconstruct the original block in
390       eclass8 [0 .. nblock-1], since the
391       previous phase destroyed it.
392    --*/
393     if (verb >= 4)
394         VPrintf0("        reconstructing block ...\n");
395     j = 0;
396     for (i = 0; i < nblock; i++) {
397         while (ftabCopy[j] == 0)
398             j++;
399         ftabCopy[j]--;
400         eclass8[fmap[i]] = (UChar)j;
401     }
402     AssertH(j < 256, 1005);
403 }
404 
405 #undef SET_BH
406 #undef CLEAR_BH
407 #undef ISSET_BH
408 #undef WORD_BH
409 #undef UNALIGNED_BH
410 
411 /*---------------------------------------------*/
412 /*--- The main, O(N^2 log(N)) sorting       ---*/
413 /*--- algorithm.  Faster for "normal"       ---*/
414 /*--- non-repetitive blocks.                ---*/
415 /*---------------------------------------------*/
416 
417 /*---------------------------------------------*/
mainGtU(UInt32 i1,UInt32 i2,UChar * block,UInt16 * quadrant,UInt32 nblock,Int32 * budget)418 static __inline__ Bool mainGtU(UInt32 i1,
419                                UInt32 i2,
420                                UChar* block,
421                                UInt16* quadrant,
422                                UInt32 nblock,
423                                Int32* budget) {
424     Int32 k;
425     UChar c1, c2;
426     UInt16 s1, s2;
427 
428     AssertD(i1 != i2, "mainGtU");
429     /* 1 */
430     c1 = block[i1];
431     c2 = block[i2];
432     if (c1 != c2)
433         return (c1 > c2);
434     i1++;
435     i2++;
436     /* 2 */
437     c1 = block[i1];
438     c2 = block[i2];
439     if (c1 != c2)
440         return (c1 > c2);
441     i1++;
442     i2++;
443     /* 3 */
444     c1 = block[i1];
445     c2 = block[i2];
446     if (c1 != c2)
447         return (c1 > c2);
448     i1++;
449     i2++;
450     /* 4 */
451     c1 = block[i1];
452     c2 = block[i2];
453     if (c1 != c2)
454         return (c1 > c2);
455     i1++;
456     i2++;
457     /* 5 */
458     c1 = block[i1];
459     c2 = block[i2];
460     if (c1 != c2)
461         return (c1 > c2);
462     i1++;
463     i2++;
464     /* 6 */
465     c1 = block[i1];
466     c2 = block[i2];
467     if (c1 != c2)
468         return (c1 > c2);
469     i1++;
470     i2++;
471     /* 7 */
472     c1 = block[i1];
473     c2 = block[i2];
474     if (c1 != c2)
475         return (c1 > c2);
476     i1++;
477     i2++;
478     /* 8 */
479     c1 = block[i1];
480     c2 = block[i2];
481     if (c1 != c2)
482         return (c1 > c2);
483     i1++;
484     i2++;
485     /* 9 */
486     c1 = block[i1];
487     c2 = block[i2];
488     if (c1 != c2)
489         return (c1 > c2);
490     i1++;
491     i2++;
492     /* 10 */
493     c1 = block[i1];
494     c2 = block[i2];
495     if (c1 != c2)
496         return (c1 > c2);
497     i1++;
498     i2++;
499     /* 11 */
500     c1 = block[i1];
501     c2 = block[i2];
502     if (c1 != c2)
503         return (c1 > c2);
504     i1++;
505     i2++;
506     /* 12 */
507     c1 = block[i1];
508     c2 = block[i2];
509     if (c1 != c2)
510         return (c1 > c2);
511     i1++;
512     i2++;
513 
514     k = nblock + 8;
515 
516     do {
517         /* 1 */
518         c1 = block[i1];
519         c2 = block[i2];
520         if (c1 != c2)
521             return (c1 > c2);
522         s1 = quadrant[i1];
523         s2 = quadrant[i2];
524         if (s1 != s2)
525             return (s1 > s2);
526         i1++;
527         i2++;
528         /* 2 */
529         c1 = block[i1];
530         c2 = block[i2];
531         if (c1 != c2)
532             return (c1 > c2);
533         s1 = quadrant[i1];
534         s2 = quadrant[i2];
535         if (s1 != s2)
536             return (s1 > s2);
537         i1++;
538         i2++;
539         /* 3 */
540         c1 = block[i1];
541         c2 = block[i2];
542         if (c1 != c2)
543             return (c1 > c2);
544         s1 = quadrant[i1];
545         s2 = quadrant[i2];
546         if (s1 != s2)
547             return (s1 > s2);
548         i1++;
549         i2++;
550         /* 4 */
551         c1 = block[i1];
552         c2 = block[i2];
553         if (c1 != c2)
554             return (c1 > c2);
555         s1 = quadrant[i1];
556         s2 = quadrant[i2];
557         if (s1 != s2)
558             return (s1 > s2);
559         i1++;
560         i2++;
561         /* 5 */
562         c1 = block[i1];
563         c2 = block[i2];
564         if (c1 != c2)
565             return (c1 > c2);
566         s1 = quadrant[i1];
567         s2 = quadrant[i2];
568         if (s1 != s2)
569             return (s1 > s2);
570         i1++;
571         i2++;
572         /* 6 */
573         c1 = block[i1];
574         c2 = block[i2];
575         if (c1 != c2)
576             return (c1 > c2);
577         s1 = quadrant[i1];
578         s2 = quadrant[i2];
579         if (s1 != s2)
580             return (s1 > s2);
581         i1++;
582         i2++;
583         /* 7 */
584         c1 = block[i1];
585         c2 = block[i2];
586         if (c1 != c2)
587             return (c1 > c2);
588         s1 = quadrant[i1];
589         s2 = quadrant[i2];
590         if (s1 != s2)
591             return (s1 > s2);
592         i1++;
593         i2++;
594         /* 8 */
595         c1 = block[i1];
596         c2 = block[i2];
597         if (c1 != c2)
598             return (c1 > c2);
599         s1 = quadrant[i1];
600         s2 = quadrant[i2];
601         if (s1 != s2)
602             return (s1 > s2);
603         i1++;
604         i2++;
605 
606         if (i1 >= nblock)
607             i1 -= nblock;
608         if (i2 >= nblock)
609             i2 -= nblock;
610 
611         k -= 8;
612         (*budget)--;
613     } while (k >= 0);
614 
615     return False;
616 }
617 
618 /*---------------------------------------------*/
619 /*--
620    Knuth's increments seem to work better
621    than Incerpi-Sedgewick here.  Possibly
622    because the number of elements to sort
623    is usually small, typically <= 20.
624 --*/
625 static Int32 incs[14] = { 1,    4,    13,    40,    121,    364,    1093,
626                           3280, 9841, 29524, 88573, 265720, 797161, 2391484 };
627 
mainSimpleSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 lo,Int32 hi,Int32 d,Int32 * budget)628 static void mainSimpleSort(UInt32* ptr,
629                            UChar* block,
630                            UInt16* quadrant,
631                            Int32 nblock,
632                            Int32 lo,
633                            Int32 hi,
634                            Int32 d,
635                            Int32* budget) {
636     Int32 i, j, h, bigN, hp;
637     UInt32 v;
638 
639     bigN = hi - lo + 1;
640     if (bigN < 2)
641         return;
642 
643     hp = 0;
644     while (incs[hp] < bigN)
645         hp++;
646     hp--;
647 
648     for (; hp >= 0; hp--) {
649         h = incs[hp];
650 
651         i = lo + h;
652         while (True) {
653             /*-- copy 1 --*/
654             if (i > hi)
655                 break;
656             v = ptr[i];
657             j = i;
658             while (mainGtU(ptr[j - h] + d, v + d, block, quadrant, nblock, budget)) {
659                 ptr[j] = ptr[j - h];
660                 j = j - h;
661                 if (j <= (lo + h - 1))
662                     break;
663             }
664             ptr[j] = v;
665             i++;
666 
667             /*-- copy 2 --*/
668             if (i > hi)
669                 break;
670             v = ptr[i];
671             j = i;
672             while (mainGtU(ptr[j - h] + d, v + d, block, quadrant, nblock, budget)) {
673                 ptr[j] = ptr[j - h];
674                 j = j - h;
675                 if (j <= (lo + h - 1))
676                     break;
677             }
678             ptr[j] = v;
679             i++;
680 
681             /*-- copy 3 --*/
682             if (i > hi)
683                 break;
684             v = ptr[i];
685             j = i;
686             while (mainGtU(ptr[j - h] + d, v + d, block, quadrant, nblock, budget)) {
687                 ptr[j] = ptr[j - h];
688                 j = j - h;
689                 if (j <= (lo + h - 1))
690                     break;
691             }
692             ptr[j] = v;
693             i++;
694 
695             if (*budget < 0)
696                 return;
697         }
698     }
699 }
700 
701 /*---------------------------------------------*/
702 /*--
703    The following is an implementation of
704    an elegant 3-way quicksort for strings,
705    described in a paper "Fast Algorithms for
706    Sorting and Searching Strings", by Robert
707    Sedgewick and Jon L. Bentley.
708 --*/
709 
710 #define mswap(zz1, zz2)    \
711     {                      \
712         Int32 zztmp = zz1; \
713         zz1 = zz2;         \
714         zz2 = zztmp;       \
715     }
716 
717 #define mvswap(zzp1, zzp2, zzn)          \
718     {                                    \
719         Int32 yyp1 = (zzp1);             \
720         Int32 yyp2 = (zzp2);             \
721         Int32 yyn = (zzn);               \
722         while (yyn > 0) {                \
723             mswap(ptr[yyp1], ptr[yyp2]); \
724             yyp1++;                      \
725             yyp2++;                      \
726             yyn--;                       \
727         }                                \
728     }
729 
mmed3(UChar a,UChar b,UChar c)730 static __inline__ UChar mmed3(UChar a, UChar b, UChar c) {
731     UChar t;
732     if (a > b) {
733         t = a;
734         a = b;
735         b = t;
736     };
737     if (b > c) {
738         b = c;
739         if (a > b)
740             b = a;
741     }
742     return b;
743 }
744 
745 #define mmin(a, b) ((a) < (b)) ? (a) : (b)
746 
747 #define mpush(lz, hz, dz) \
748     {                     \
749         stackLo[sp] = lz; \
750         stackHi[sp] = hz; \
751         stackD[sp] = dz;  \
752         sp++;             \
753     }
754 
755 #define mpop(lz, hz, dz)  \
756     {                     \
757         sp--;             \
758         lz = stackLo[sp]; \
759         hz = stackHi[sp]; \
760         dz = stackD[sp];  \
761     }
762 
763 #define mnextsize(az) (nextHi[az] - nextLo[az])
764 
765 #define mnextswap(az, bz)        \
766     {                            \
767         Int32 tz;                \
768         tz = nextLo[az];         \
769         nextLo[az] = nextLo[bz]; \
770         nextLo[bz] = tz;         \
771         tz = nextHi[az];         \
772         nextHi[az] = nextHi[bz]; \
773         nextHi[bz] = tz;         \
774         tz = nextD[az];          \
775         nextD[az] = nextD[bz];   \
776         nextD[bz] = tz;          \
777     }
778 
779 #define MAIN_QSORT_SMALL_THRESH 20
780 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
781 #define MAIN_QSORT_STACK_SIZE   100
782 
mainQSort3(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 loSt,Int32 hiSt,Int32 dSt,Int32 * budget)783 static void mainQSort3(UInt32* ptr,
784                        UChar* block,
785                        UInt16* quadrant,
786                        Int32 nblock,
787                        Int32 loSt,
788                        Int32 hiSt,
789                        Int32 dSt,
790                        Int32* budget) {
791     Int32 unLo, unHi, ltLo, gtHi, n, m, med;
792     Int32 sp, lo, hi, d;
793 
794     Int32 stackLo[MAIN_QSORT_STACK_SIZE];
795     Int32 stackHi[MAIN_QSORT_STACK_SIZE];
796     Int32 stackD[MAIN_QSORT_STACK_SIZE];
797 
798     Int32 nextLo[3];
799     Int32 nextHi[3];
800     Int32 nextD[3];
801 
802     sp = 0;
803     mpush(loSt, hiSt, dSt);
804 
805     while (sp > 0) {
806         AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
807 
808         mpop(lo, hi, d);
809         if (hi - lo < MAIN_QSORT_SMALL_THRESH || d > MAIN_QSORT_DEPTH_THRESH) {
810             mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
811             if (*budget < 0)
812                 return;
813             continue;
814         }
815 
816         med = (Int32)mmed3(block[ptr[lo] + d], block[ptr[hi] + d], block[ptr[(lo + hi) >> 1] + d]);
817 
818         unLo = ltLo = lo;
819         unHi = gtHi = hi;
820 
821         while (True) {
822             while (True) {
823                 if (unLo > unHi)
824                     break;
825                 n = ((Int32)block[ptr[unLo] + d]) - med;
826                 if (n == 0) {
827                     mswap(ptr[unLo], ptr[ltLo]);
828                     ltLo++;
829                     unLo++;
830                     continue;
831                 };
832                 if (n > 0)
833                     break;
834                 unLo++;
835             }
836             while (True) {
837                 if (unLo > unHi)
838                     break;
839                 n = ((Int32)block[ptr[unHi] + d]) - med;
840                 if (n == 0) {
841                     mswap(ptr[unHi], ptr[gtHi]);
842                     gtHi--;
843                     unHi--;
844                     continue;
845                 };
846                 if (n < 0)
847                     break;
848                 unHi--;
849             }
850             if (unLo > unHi)
851                 break;
852             mswap(ptr[unLo], ptr[unHi]);
853             unLo++;
854             unHi--;
855         }
856 
857         AssertD(unHi == unLo - 1, "mainQSort3(2)");
858 
859         if (gtHi < ltLo) {
860             mpush(lo, hi, d + 1);
861             continue;
862         }
863 
864         n = mmin(ltLo - lo, unLo - ltLo);
865         mvswap(lo, unLo - n, n);
866         m = mmin(hi - gtHi, gtHi - unHi);
867         mvswap(unLo, hi - m + 1, m);
868 
869         n = lo + unLo - ltLo - 1;
870         m = hi - (gtHi - unHi) + 1;
871 
872         nextLo[0] = lo;
873         nextHi[0] = n;
874         nextD[0] = d;
875         nextLo[1] = m;
876         nextHi[1] = hi;
877         nextD[1] = d;
878         nextLo[2] = n + 1;
879         nextHi[2] = m - 1;
880         nextD[2] = d + 1;
881 
882         if (mnextsize(0) < mnextsize(1))
883             mnextswap(0, 1);
884         if (mnextsize(1) < mnextsize(2))
885             mnextswap(1, 2);
886         if (mnextsize(0) < mnextsize(1))
887             mnextswap(0, 1);
888 
889         AssertD(mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
890         AssertD(mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
891 
892         mpush(nextLo[0], nextHi[0], nextD[0]);
893         mpush(nextLo[1], nextHi[1], nextD[1]);
894         mpush(nextLo[2], nextHi[2], nextD[2]);
895     }
896 }
897 
898 #undef mswap
899 #undef mvswap
900 #undef mpush
901 #undef mpop
902 #undef mmin
903 #undef mnextsize
904 #undef mnextswap
905 #undef MAIN_QSORT_SMALL_THRESH
906 #undef MAIN_QSORT_DEPTH_THRESH
907 #undef MAIN_QSORT_STACK_SIZE
908 
909 /*---------------------------------------------*/
910 /* Pre:
911       nblock > N_OVERSHOOT
912       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
913       ((UChar*)block32) [0 .. nblock-1] holds block
914       ptr exists for [0 .. nblock-1]
915 
916    Post:
917       ((UChar*)block32) [0 .. nblock-1] holds block
918       All other areas of block32 destroyed
919       ftab [0 .. 65536 ] destroyed
920       ptr [0 .. nblock-1] holds sorted order
921       if (*budget < 0), sorting was abandoned
922 */
923 
924 #define BIGFREQ(b) (ftab[((b) + 1) << 8] - ftab[(b) << 8])
925 #define SETMASK    (1 << 21)
926 #define CLEARMASK  (~(SETMASK))
927 
mainSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,UInt32 * ftab,Int32 nblock,Int32 verb,Int32 * budget)928 static void mainSort(UInt32* ptr,
929                      UChar* block,
930                      UInt16* quadrant,
931                      UInt32* ftab,
932                      Int32 nblock,
933                      Int32 verb,
934                      Int32* budget) {
935     Int32 i, j, k, ss, sb;
936     Int32 runningOrder[256];
937     Bool bigDone[256];
938     Int32 copyStart[256];
939     Int32 copyEnd[256];
940     UChar c1;
941     Int32 numQSorted;
942     UInt16 s;
943     if (verb >= 4)
944         VPrintf0("        main sort initialise ...\n");
945 
946     /*-- set up the 2-byte frequency table --*/
947     for (i = 65536; i >= 0; i--)
948         ftab[i] = 0;
949 
950     j = block[0] << 8;
951     i = nblock - 1;
952     for (; i >= 3; i -= 4) {
953         quadrant[i] = 0;
954         j = (j >> 8) | (((UInt16)block[i]) << 8);
955         ftab[j]++;
956         quadrant[i - 1] = 0;
957         j = (j >> 8) | (((UInt16)block[i - 1]) << 8);
958         ftab[j]++;
959         quadrant[i - 2] = 0;
960         j = (j >> 8) | (((UInt16)block[i - 2]) << 8);
961         ftab[j]++;
962         quadrant[i - 3] = 0;
963         j = (j >> 8) | (((UInt16)block[i - 3]) << 8);
964         ftab[j]++;
965     }
966     for (; i >= 0; i--) {
967         quadrant[i] = 0;
968         j = (j >> 8) | (((UInt16)block[i]) << 8);
969         ftab[j]++;
970     }
971 
972     /*-- (emphasises close relationship of block & quadrant) --*/
973     for (i = 0; i < BZ_N_OVERSHOOT; i++) {
974         block[nblock + i] = block[i];
975         quadrant[nblock + i] = 0;
976     }
977 
978     if (verb >= 4)
979         VPrintf0("        bucket sorting ...\n");
980 
981     /*-- Complete the initial radix sort --*/
982     for (i = 1; i <= 65536; i++)
983         ftab[i] += ftab[i - 1];
984 
985     s = block[0] << 8;
986     i = nblock - 1;
987     for (; i >= 3; i -= 4) {
988         s = (s >> 8) | (block[i] << 8);
989         j = ftab[s] - 1;
990         ftab[s] = j;
991         ptr[j] = i;
992         s = (s >> 8) | (block[i - 1] << 8);
993         j = ftab[s] - 1;
994         ftab[s] = j;
995         ptr[j] = i - 1;
996         s = (s >> 8) | (block[i - 2] << 8);
997         j = ftab[s] - 1;
998         ftab[s] = j;
999         ptr[j] = i - 2;
1000         s = (s >> 8) | (block[i - 3] << 8);
1001         j = ftab[s] - 1;
1002         ftab[s] = j;
1003         ptr[j] = i - 3;
1004     }
1005     for (; i >= 0; i--) {
1006         s = (s >> 8) | (block[i] << 8);
1007         j = ftab[s] - 1;
1008         ftab[s] = j;
1009         ptr[j] = i;
1010     }
1011 
1012     /*--
1013       Now ftab contains the first loc of every small bucket.
1014       Calculate the running order, from smallest to largest
1015       big bucket.
1016    --*/
1017     for (i = 0; i <= 255; i++) {
1018         bigDone[i] = False;
1019         runningOrder[i] = i;
1020     }
1021 
1022     {
1023         Int32 vv;
1024         Int32 h = 1;
1025         do
1026             h = 3 * h + 1;
1027         while (h <= 256);
1028         do {
1029             h = h / 3;
1030             for (i = h; i <= 255; i++) {
1031                 vv = runningOrder[i];
1032                 j = i;
1033                 while (BIGFREQ(runningOrder[j - h]) > BIGFREQ(vv)) {
1034                     runningOrder[j] = runningOrder[j - h];
1035                     j = j - h;
1036                     if (j <= (h - 1))
1037                         goto zero;
1038                 }
1039             zero:
1040                 runningOrder[j] = vv;
1041             }
1042         } while (h != 1);
1043     }
1044 
1045     /*--
1046       The main sorting loop.
1047    --*/
1048 
1049     numQSorted = 0;
1050 
1051     for (i = 0; i <= 255; i++) {
1052         /*--
1053          Process big buckets, starting with the least full.
1054          Basically this is a 3-step process in which we call
1055          mainQSort3 to sort the small buckets [ss, j], but
1056          also make a big effort to avoid the calls if we can.
1057       --*/
1058         ss = runningOrder[i];
1059 
1060         /*--
1061          Step 1:
1062          Complete the big bucket [ss] by quicksorting
1063          any unsorted small buckets [ss, j], for j != ss.
1064          Hopefully previous pointer-scanning phases have already
1065          completed many of the small buckets [ss, j], so
1066          we don't have to sort them at all.
1067       --*/
1068         for (j = 0; j <= 255; j++) {
1069             if (j != ss) {
1070                 sb = (ss << 8) + j;
1071                 if (!(ftab[sb] & SETMASK)) {
1072                     Int32 lo = ftab[sb] & CLEARMASK;
1073                     Int32 hi = (ftab[sb + 1] & CLEARMASK) - 1;
1074                     if (hi > lo) {
1075                         if (verb >= 4)
1076                             VPrintf4("        qsort [0x%x, 0x%x]   "
1077                                      "done %d   this %d\n",
1078                                      ss,
1079                                      j,
1080                                      numQSorted,
1081                                      hi - lo + 1);
1082                         mainQSort3(ptr, block, quadrant, nblock, lo, hi, BZ_N_RADIX, budget);
1083                         numQSorted += (hi - lo + 1);
1084                         if (*budget < 0)
1085                             return;
1086                     }
1087                 }
1088                 ftab[sb] |= SETMASK;
1089             }
1090         }
1091 
1092         AssertH(!bigDone[ss], 1006);
1093 
1094         /*--
1095          Step 2:
1096          Now scan this big bucket [ss] so as to synthesise the
1097          sorted order for small buckets [t, ss] for all t,
1098          including, magically, the bucket [ss,ss] too.
1099          This will avoid doing Real Work in subsequent Step 1's.
1100       --*/
1101         {
1102             for (j = 0; j <= 255; j++) {
1103                 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
1104                 copyEnd[j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
1105             }
1106             for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
1107                 k = ptr[j] - 1;
1108                 if (k < 0)
1109                     k += nblock;
1110                 c1 = block[k];
1111                 if (!bigDone[c1])
1112                     ptr[copyStart[c1]++] = k;
1113             }
1114             for (j = (ftab[(ss + 1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
1115                 k = ptr[j] - 1;
1116                 if (k < 0)
1117                     k += nblock;
1118                 c1 = block[k];
1119                 if (!bigDone[c1])
1120                     ptr[copyEnd[c1]--] = k;
1121             }
1122         }
1123 
1124         AssertH((copyStart[ss] - 1 == copyEnd[ss]) ||
1125                     /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
1126                    Necessity for this case is demonstrated by compressing
1127                    a sequence of approximately 48.5 million of character
1128                    251; 1.0.0/1.0.1 will then die here. */
1129                     (copyStart[ss] == 0 && copyEnd[ss] == nblock - 1),
1130                 1007)
1131 
1132             for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
1133 
1134         /*--
1135          Step 3:
1136          The [ss] big bucket is now done.  Record this fact,
1137          and update the quadrant descriptors.  Remember to
1138          update quadrants in the overshoot area too, if
1139          necessary.  The "if (i < 255)" test merely skips
1140          this updating for the last bucket processed, since
1141          updating for the last bucket is pointless.
1142 
1143          The quadrant array provides a way to incrementally
1144          cache sort orderings, as they appear, so as to
1145          make subsequent comparisons in fullGtU() complete
1146          faster.  For repetitive blocks this makes a big
1147          difference (but not big enough to be able to avoid
1148          the fallback sorting mechanism, exponential radix sort).
1149 
1150          The precise meaning is: at all times:
1151 
1152             for 0 <= i < nblock and 0 <= j <= nblock
1153 
1154             if block[i] != block[j],
1155 
1156                then the relative values of quadrant[i] and
1157                     quadrant[j] are meaningless.
1158 
1159                else {
1160                   if quadrant[i] < quadrant[j]
1161                      then the string starting at i lexicographically
1162                      precedes the string starting at j
1163 
1164                   else if quadrant[i] > quadrant[j]
1165                      then the string starting at j lexicographically
1166                      precedes the string starting at i
1167 
1168                   else
1169                      the relative ordering of the strings starting
1170                      at i and j has not yet been determined.
1171                }
1172       --*/
1173         bigDone[ss] = True;
1174 
1175         if (i < 255) {
1176             Int32 bbStart = ftab[ss << 8] & CLEARMASK;
1177             Int32 bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart;
1178             Int32 shifts = 0;
1179 
1180             while ((bbSize >> shifts) > 65534)
1181                 shifts++;
1182 
1183             for (j = bbSize - 1; j >= 0; j--) {
1184                 Int32 a2update = ptr[bbStart + j];
1185                 UInt16 qVal = (UInt16)(j >> shifts);
1186                 quadrant[a2update] = qVal;
1187                 if (a2update < BZ_N_OVERSHOOT)
1188                     quadrant[a2update + nblock] = qVal;
1189             }
1190             AssertH(((bbSize - 1) >> shifts) <= 65535, 1002);
1191         }
1192     }
1193 
1194     if (verb >= 4)
1195         VPrintf3("        %d pointers, %d sorted, %d scanned\n",
1196                  nblock,
1197                  numQSorted,
1198                  nblock - numQSorted);
1199 }
1200 
1201 #undef BIGFREQ
1202 #undef SETMASK
1203 #undef CLEARMASK
1204 
1205 /*---------------------------------------------*/
1206 /* Pre:
1207       nblock > 0
1208       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1209       ((UChar*)arr2)  [0 .. nblock-1] holds block
1210       arr1 exists for [0 .. nblock-1]
1211 
1212    Post:
1213       ((UChar*)arr2) [0 .. nblock-1] holds block
1214       All other areas of block destroyed
1215       ftab [ 0 .. 65536 ] destroyed
1216       arr1 [0 .. nblock-1] holds sorted order
1217 */
BZ2_blockSort(EState * s)1218 void BZ2_blockSort(EState* s) {
1219     UInt32* ptr = s->ptr;
1220     UChar* block = s->block;
1221     UInt32* ftab = s->ftab;
1222     Int32 nblock = s->nblock;
1223     Int32 verb = s->verbosity;
1224     Int32 wfact = s->workFactor;
1225     UInt16* quadrant;
1226     Int32 budget;
1227     Int32 budgetInit;
1228     Int32 i;
1229 
1230     if (nblock < 10000) {
1231         fallbackSort(s->arr1, s->arr2, ftab, nblock, verb);
1232     }
1233     else {
1234         /* Calculate the location for quadrant, remembering to get
1235          the alignment right.  Assumes that &(block[0]) is at least
1236          2-byte aligned -- this should be ok since block is really
1237          the first section of arr2.
1238       */
1239         i = nblock + BZ_N_OVERSHOOT;
1240         if (i & 1)
1241             i++;
1242         quadrant = (UInt16*)(&(block[i]));
1243 
1244         /* (wfact-1) / 3 puts the default-factor-30
1245          transition point at very roughly the same place as
1246          with v0.1 and v0.9.0.
1247          Not that it particularly matters any more, since the
1248          resulting compressed stream is now the same regardless
1249          of whether or not we use the main sort or fallback sort.
1250       */
1251         if (wfact < 1)
1252             wfact = 1;
1253         if (wfact > 100)
1254             wfact = 100;
1255         budgetInit = nblock * ((wfact - 1) / 3);
1256         budget = budgetInit;
1257 
1258         mainSort(ptr, block, quadrant, ftab, nblock, verb, &budget);
1259         if (verb >= 3)
1260             VPrintf3("      %d work, %d block, ratio %5.2f\n",
1261                      budgetInit - budget,
1262                      nblock,
1263                      (float)(budgetInit - budget) / (float)(nblock == 0 ? 1 : nblock));
1264         if (budget < 0) {
1265             if (verb >= 2)
1266                 VPrintf0("    too repetitive; using fallback"
1267                          " sorting algorithm\n");
1268             fallbackSort(s->arr1, s->arr2, ftab, nblock, verb);
1269         }
1270     }
1271 
1272     s->origPtr = -1;
1273     for (i = 0; i < s->nblock; i++)
1274         if (ptr[i] == 0) {
1275             s->origPtr = i;
1276             break;
1277         };
1278 
1279     AssertH(s->origPtr != -1, 1003);
1280 }
1281 
1282 /*-------------------------------------------------------------*/
1283 /*--- end                                       blocksort.c ---*/
1284 /*-------------------------------------------------------------*/
1285